root/main/trunk/greenstone2/perllib/classify/Phind.pm @ 27904

Revision 27904, 41.3 KB (checked in by ak19, 6 years ago)

Replacing calls to deprecated utils:: subroutines with FileUtils:: equivalents.

  • Property svn:keywords set to Author Date Id Revision
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32package Phind;
33
34use BaseClassifier;
35use FileUtils;
36use util;
37use ghtml;
38use unicode;
39
40use strict;
41no strict 'refs'; # allow filehandles to be variables and vice versa
42
43my @removedirs = ();
44
45my %wanted_index_files = ('td'=>1,
46              't'=>1,
47              'ti'=>1,
48              'tl'=>1,
49              'tsd'=>1,
50              'idb'=>1,
51              'ib1'=>1,
52              'ib2'=>1,
53              'ib3'=>1,
54              'i'=>1,
55              'il'=>1,
56              'w'=>1,
57              'wa'=>1);
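# These appear to be the suffixes of the mgpp index and text files built by
# get_classify_info() (for the docs, pdata and pword databases); anything
# else left in a phind directory is treated as temporary and cleaned up in
# END below, unless -untidy is set.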
58
59sub BEGIN {
60    @Phind::ISA = ('BaseClassifier');
61}
62
63sub END {
64   
65    # Tidy up stray files - we do this here as there's some weird problem
66    # preventing us from doing it in the get_classify_info() function (on
67    # windows at least) where the close() appears to fail on txthandle and
68    # dochandle, thus preventing us from deleting those files
69
70    foreach my $dir (@removedirs) {
71    if (-d $dir && opendir (DIR, $dir)) {
72        my @files = readdir DIR;
73        closedir DIR;
74   
75        foreach my $file (@files) {
76        next if $file =~ /^\.\.?$/;
77        my ($suffix) = $file =~ /\.([^\.]+)$/;
78        if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
79            # delete it!
80            &FileUtils::removeFiles (&FileUtils::filenameConcatenate ($dir, $file));
81        }
82        }
83    }
84    }
85}
86
87my $arguments =
88    [ { 'name' => "text",
89    'desc' => "{Phind.text}",
90    'type' => "string",
91    'deft' => "section:Title,section:text",
92    'reqd' => "no" },
93      { 'name' => "title",
94    'desc' => "{Phind.title}",
95    'type' => "metadata",
96    'deft' => "Title",
97    'reqd' => "no" },
98      { 'name' => "buttonname",
99    'desc' => "{BasClas.buttonname}",
100    'type' => "string",
101    'deft' => "Phrase",
102    'reqd' => "no" },
103      { 'name' => "language",
104    'desc' => "{Phind.language}",
105    'type' => "string",
106    'deft' => "en",
107    'reqd' => "no" },
108      { 'name' => "savephrases",
109    'desc' => "{Phind.savephrases}",
110    'type' => "string",
111    'deft' => "",
112    'reqd' => "no" },
113      { 'name' => "suffixmode",
114    'desc' => "{Phind.suffixmode}",
115    'type' => "int",
116    'deft' => "1",
117    'range' => "0,1",
118    'reqd' => "no" },
119      { 'name' => "min_occurs",
120    'desc' => "{Phind.min_occurs}",
121    'type' => "int",
122    'deft' => "2",
123    'range' => "1,",
124    'reqd' => "no" },
125      { 'name' => "thesaurus",
126    'desc' => "{Phind.thesaurus}",
127    'type' => "string",
128    'deft' => "",
129    'reqd' => "no" },
130      { 'name' => "untidy",
131    'desc' => "{Phind.untidy}",
132    'type' => "flag",
133    'reqd' => "no" } ];
134
135my $options = { 'name'     => "Phind",
136        'desc'     => "{Phind.desc}",
137        'abstract' => "no",
138        'inherits' => "yes",
139        'args'     => $arguments };
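# A hypothetical collect.cfg line using these options (the values shown are
# just the defaults above) might look something like:
#
#   classify Phind -text section:Title,section:text -title Title -buttonname Phrase -language en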
140
141
142# Phrase delimiter symbols - these should be abstracted out someplace
143
144my $colstart = "COLLECTIONSTART";
145my $colend   = "COLLECTIONEND";
146my $doclimit = "DOCUMENTLIMIT";
147my $senlimit = "SENTENCELIMIT";
148my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
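# Roughly how these are used (see classify() and extract_vocabulary() below):
# the clauses file gets a DOCUMENTLIMIT line before each document's text, and
# when the text is rewritten as symbol numbers in clauses.numbers the stream
# is wrapped in COLLECTIONSTART ... COLLECTIONEND, with SENTENCELIMIT ending
# each clause.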
149
150
151# Create a new Phind browser based on collect.cfg
152
153sub new {
154    my ($class) = shift (@_);
155    my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
156    push(@$classifierslist, $class);
157
158    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
159    push(@{$hashArgOptLists->{"OptList"}},$options);
160
161    my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
162
163    if ($self->{'info_only'}) {
164    # don't worry about any options etc
165    return bless $self, $class;
166    }
167
168    # Ensure the Phind generate scripts are in place
169    my $file1 = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
170    $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
171    my $src = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "src", "phind", "generate");
172    if (!(-e $file1)) {
173    print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
174    exit(1);
175    }
176   
177    # option values that may have an "ex." metadata namespace prefix that needs to be stripped off
178    $self->{'text'} = $self->strip_ex_from_metadata($self->{'text'});
179    $self->{'title'} = $self->strip_ex_from_metadata($self->{'title'});
180
181    # Transfer the value from the "text" option (Auto Parsing) to the variable name used in previous Greenstone versions.
182   
183    $self->{"indexes"} = $self->{"text"};
184
185    # Further setup
186    $self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
187    $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
188    if (! defined $self->{'builddir'}) {
189    $self->{'builddir'} = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "building");
190    }
191    $self->{'total'} = 0;
192   
193    # we set phind to be rtl if language is arabic
194    if ($self->{'language'} eq "ar") {
195    $self->{'textorientation'} = "rtl";
196    }
197    # Clean out the unused keys
198    delete $self->{"text"};
199
200    return bless $self, $class;
201}
202
203
204# Initialise the Phind classifier
205
206sub init {
207    my $self = shift (@_);
208
209    # ensure we have a build directory
210    my $builddir = $self->{'builddir'};
211    die unless (-e "$builddir");
212
213    # create Phind directory
214    my $phnumber = 1;
215    my $phinddir = &FileUtils::filenameConcatenate($builddir, "phind1");
216    while (-e "$phinddir") {
217    $phnumber++;
218    $phinddir = &FileUtils::filenameConcatenate($builddir, "phind$phnumber");
219    }
220    &util::mk_dir("$phinddir");
221    $self->{'phinddir'} = $phinddir;
222    $self->{'phindnumber'} = $phnumber;
223
224    push(@removedirs, $phinddir) unless $self->{'untidy'};
225
226    # open filehandles for documents and text
227    my $clausefile =  &FileUtils::filenameConcatenate("$phinddir", "clauses");
228    &FileUtils::removeFiles($clausefile) if (-e $clausefile);
229
230    my $txthandle = 'TEXT' . $phnumber;
231    open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
232    $self->{'txthandle'} = $txthandle;
233
234    my $docfile = &FileUtils::filenameConcatenate("$phinddir", "docs.txt");
235    &FileUtils::removeFiles($docfile) if (-e $docfile);
236
237    my $dochandle = 'DOC' . $phnumber;
238    open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
239    $self->{'dochandle'} = $dochandle;
240   
241}
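# After init() the build directory contains building/phind<N>/ with "clauses"
# and "docs.txt" open for writing; classify() fills them in, and
# get_classify_info() later turns them into the mgpp databases (docs, pdata,
# pword) read by the phindcgi script.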
242
243
244# Classify each document.
245#
246# Each document is passed here in turn.  The classifier extracts the
247# text of each and stores it in the clauses file.  Document details are
248# stored in the docs.txt file.
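# For each document this produces, roughly:
#   docs.txt:  <Document>\t<OID>\t<Title>
#   clauses:   a DOCUMENTLIMIT line followed by the extracted text, one clause per line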
249
250sub classify {
251    my $self = shift (@_);
252    my ($doc_obj) = @_;
253
254    my $verbosity = $self->{'verbosity'};
255    my $top_section = $doc_obj->get_top_section();
256
257    my $titlefield = $self->{'title'};
258   
259    my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
260    if (!defined($title)) {
261    $title = "";
262    print STDERR "Phind: document has no title\n";
263    }
264    print "process: $title\n" if ($verbosity > 2);
265
266    # Only consider the file if it is in the correct language
267    my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
268    my $phrlanguage = $self->{'language'};
269    return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
270
271    # record this file
272    $self->{'total'} ++;
273    # what is $file ???
274    # print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
275
276    # Store document details
277    my $OID = $doc_obj->get_OID();
278    $OID = "NULL" unless defined $OID;
279    my $dochandle = $self->{'dochandle'};
280    print $dochandle "<Document>\t$OID\t$title\n";
281   
282    # Store the text occurring in this object
283
284    # output the document delimiter
285    my $txthandle = $self->{'txthandle'};
286    print $txthandle "$doclimit\n";
287
288    # iterate over the required indexes and store their text
289    my $indexes = $self->{'indexes'};
290    my $text = "";
291    my ($part, $level, $field, $section, $data, $dataref);
292
293    foreach $part (split(/,/, $indexes)) {
294
295    # Each field has a level and a data element (e.g. document:Title)
296    ($level, $field) = split(/:/, $part);
297    die unless ($level && $field);
298   
299    # Extract the text from every section
300    # (In phind, document:text and section:text are equivalent)
301    if ($field eq "text") {
302        $data = "";
303        $section = $doc_obj->get_top_section();
304        while (defined($section)) {
305        $data .= $doc_obj->get_text($section) . "\n";
306        $section = $doc_obj->get_next_section($section);
307        }
308        $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
309    }
310   
311    # Extract a metadata field from a document
312    # (If there is more than one element of the given type, get them all.)
313    elsif ($level eq "document") {
314        $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
315        foreach $data (@$dataref) {
316        $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
317        }
318    }
319
320    # Extract metadata from every section in a document
321    elsif ($level eq "section") {
322        $data = "";
323        $section = $doc_obj->get_top_section();
324        while (defined($section)) {
325        $dataref = $doc_obj->get_metadata($section, $field);
326        $data .= join("\n", @$dataref) . "\n";
327        $section = $doc_obj->get_next_section($section);
328        }
329        $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
330    }
331   
332    # Some sort of specification which I don't understand
333    else {
334        die "Unknown level ($level) in Phind index ($part)\n";
335    }
336   
337    }
338
339    # output the text
340    $text =~ tr/\n//s;
341    print $txthandle "$text";
342}
343
344
345# Construct the classifier from the information already gathered
346#
347# When get_classify_info is called, the clauses and docs.txt files have
348# already been constructed in the Phind directory.  This function will
349# translate them into compressed, indexed MGPP files that can be read by
350# the phindcgi script.  It will also register our classifier so that it
351# shows up in the navigation bar.
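# In outline, the steps below are:
#   1. extract_vocabulary()  - clauses -> clauses.vocab, clauses.stats, clauses.numbers
#   2. suffix program        - reads the phind directory and writes the phrases file
#   3. renumber_phrases()    - phrases -> phrases.2 -> phrases.3 -> pdata.txt, pword.txt
#   4. mgpp_* tools          - pdata.txt, pword.txt and docs.txt -> compressed, indexed databases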
352
353sub get_classify_info {
354    my $self = shift (@_);
355    my ($gli) = @_;   
356
357    close $self->{'dochandle'};
358    close $self->{'txthandle'};
359    my $verbosity = $self->{'verbosity'};
360    my $out = $self->{'outhandle'};
361    my $phinddir = $self->{'phinddir'};
362
363    my $osextra = "";
364    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
365    $osextra = " -d /";
366    }
367
368    print STDERR "</Stage>\n" if $gli;
369
370    if ($verbosity) {
371    print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
372    print $out "***          in ", $self->{'phinddir'}, "\n";
373    }
374
375    print STDERR "<Stage name='Phind'>\n" if $gli;
376
377    # Construct phind indexes
378    my $suffixmode = $self->{'suffixmode'};
379    my $min_occurs = $self->{'min_occurs'};
380    my ($command, $status);
381   
382    # Generate the vocabulary, symbol statistics, and numbers file
383    # from the clauses file
384    print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
385    print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
386    &extract_vocabulary($self);
387 
388    # Use the suffix program to generate the phind/phrases file
389    print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
390    print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
391    &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
392
393    # check that we generated some files. It's not necessarily an error if
394    # we didn't (execute() would have quit on error), but we can't go on.
395    my $phrasesfile=&FileUtils::filenameConcatenate($self->{'phinddir'}, 'phrases');
396    if (! -r $phrasesfile) {
397    print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
398    print $out "\nNo phrases found for Phind classifier!\n";
399    return;
400    }   
401
402    # Create the phrase file and put phrase numbers in phind/phrases
403    print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
404    print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
405    &renumber_phrases($self);
406
407    print $out "\nCreating phrase databases\n";
408    print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
409    my $mg_input = &FileUtils::filenameConcatenate($phinddir, "pdata.txt");
410    my $mg_stem = &FileUtils::filenameConcatenate($phinddir, "pdata");
411
412    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
413    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
414    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
415
416    # create the mg index of words
417    print $out "\nCreating word-level search indexes\n";
418    print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
419    $mg_input = &FileUtils::filenameConcatenate($phinddir, "pword.txt");
420    $mg_stem = &FileUtils::filenameConcatenate($phinddir, "pword");
421
422    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
423    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
424    &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
425    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
426    &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
427    &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
428
429    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
430    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
431    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
432
433    # create the mg document information database
434    print $out "\nCreating document information databases\n";
435    print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
436    $mg_input = &FileUtils::filenameConcatenate($phinddir, "docs.txt");
437    $mg_stem = &FileUtils::filenameConcatenate($phinddir, "docs");
438
439    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
440    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
441    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
442
443    my $parameters = "phindnumber=$self->{'phindnumber'}";
444    if (defined ($self->{'textorientation'})) {
445    $parameters .= ";textorientation=$self->{'textorientation'}";
446    }
447    # Return the information about the classifier that we'll later want to
448    # use to create macros when the Phind classifier document is displayed.
449    my %classifyinfo = ('thistype'=>'Invisible',
450                        'childtype'=>'Phind',
451                        'Title'=>$self->{'buttonname'},
452                        'parameters'=>$parameters,
453            'contains'=>[]);
454   
455    my $collection = $self->{'collection'};
456    my $url = "library?a=p&p=phind&c=$collection";
457    push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
458   
459    return \%classifyinfo;
460}
461
462
463
464sub convert_gml_to_tokens {
465   
466    my ($language_exp, $text) = @_;
467
468    # escape any magic words... - jrm21
469    foreach my $delim (@delimiters) {
470    my $replacement=lc($delim);
471    my $num= $text=~ s/$delim/$replacement/g;
472    if (!$num) {$num=0;}
473    }
474
475    if ($language_exp =~ /^en$/) {
476    return &convert_gml_to_tokens_EN($text);
477    }
478
479    if ($language_exp =~ /zh/) {
480    return &convert_gml_to_tokens_ZH($text);
481    } 
482   
483    $_ = $text;
484
485    # 1. remove GML tags
486
487    # Remove everything that is in a tag
488    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
489    s/\s*<br>\s*/ LINEBREAK /isgo;
490    s/<[^>]*>/ /sgo;
491
492    # Now we have the text, but it may contain HTML
493    # elements coded as &gt; etc.  Remove these tags.
494    s/&amp;/&/sgo;
495    s/&lt;/</sgo;
496    s/&gt;/>/sgo;
497    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
498    s/\s*<br>\s*/ LINEBREAK /isgo;
499    s/<[^>]*>/ /sgo;
500
501    # replace <p> and <br> placeholders with the clause break symbol (\n)
502    s/\s+/ /gso;
503    s/PARAGRAPHBREAK/\n/sgo;
504    s/LINEBREAK/\n/sgo;
505
506   
507    # 2. Split the remaining text into space-delimited tokens
508
509    # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
510    s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
511
512    # Split text at word boundaries
513    s/\b/ /go;
514   
515    # 3. Convert the remaining text to "clause format"
516
517    # Insert newline if the end of a sentence is detected
518    # (delimiter is:  "[\.\?\!]\s")
519    # s/\s*[\.\?\!]\s+/\n/go;
520
521    # remove unnecessary punctuation and replace with clause break symbol (\n)
522    # the following very nicely removes all non alphanumeric characters. too bad if you are not using english...
523    #s/[^\w ]/\n/go;
524    # replace punct with new lines - is this what we want??
525    s/\s*[\?\;\:\!\,\.\"\[\]\{\}\(\)]\s*/\n/go; #"
526    # then remove other punct with space
527    s/[\'\`\\\_]/ /go;
528
529    # remove extraneous whitespace
530    s/ +/ /sgo;
531    s/^\s+//mgo;
532    s/\s*$/\n/mgo;
533
534    # remove lines that contain one word or less
535    s/^\S*$//mgo;
536    s/^\s*$//mgo;
537    tr/\n//s;
538
539    return $_;
540}
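# An illustrative example only (the exact output depends on the
# language-specific rules dispatched to above); a call such as
#
#   &convert_gml_to_tokens("en", "<p>The quick brown fox, a lazy dog.</p>");
#
# should come back as clause-per-line text along the lines of:
#
#   The quick brown fox
#   a lazy dog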
541
542# A Chinese version of convert_gml_to_tokens
543sub convert_gml_to_tokens_ZH {
544
545    $_ = shift @_;
546
547    # Replace all whitespace with a simple space
548    s/\s+/ /gs;
549    # Remove everything that is in a tag
550    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
551    s/\s*<br>\s*/ LINEBREAK /isg;
552    s/<[^>]*>/ /sg;
553
554    # Now we have the text, but it may contain HTML
555    # elements coded as &gt; etc.  Remove these tags.
556    s/&lt;/</sg;
557    s/&gt;/>/sg;
558
559    s/\s+/ /sg;
560    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
561    s/\s*<br>\s*/ LINEBREAK /isg;
562    s/<[^>]*>/ /sg;
563
564    # remove &amp; and other miscellaneous markup tags
565    s/&amp;/&/sg;
566    s/&lt;/</sg;
567    s/&gt;/>/sg;
568    s/&amp;/&/sg;
569
570    # replace <p> and <br> placeholders with newlines
571    s/PARAGRAPHBREAK/\n/sg;
572    s/LINEBREAK/\n/sg;
573
574   
575#    print STDERR "text:$_\n";
576    return $_;
577}
578
579# A version of convert_gml_to_tokens that is fine-tuned to the English language.
580
581sub convert_gml_to_tokens_EN {
582    $_ = shift @_;
583
584    # FIRST, remove GML tags
585
586    # Replace all whitespace with a simple space
587    s/\s+/ /gs;
588
589    # Remove everything that is in a tag
590    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
591    s/\s*<br>\s*/ LINEBREAK /isg;
592    s/<[^>]*>/ /sg;
593
594    # Now we have the text, but it may contain HTML
595    # elements coded as &gt; etc.  Remove these tags.
596    s/&lt;/</sg;
597    s/&gt;/>/sg;
598
599    s/\s+/ /sg;
600    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
601    s/\s*<br>\s*/ LINEBREAK /isg;
602    s/<[^>]*>/ /sg;
603
604    # remove &amp; and other miscellaneous markup tags
605    s/&amp;/&/sg;
606    s/&lt;/</sg;
607    s/&gt;/>/sg;
608    s/&amp;/&/sg;
609
610    # replace <p> and <br> placeholders with newlines
611    s/PARAGRAPHBREAK/\n/sg;
612    s/LINEBREAK/\n/sg;
613
614
615    # Exceptional punctuation
616    #
617    # We make special cases of some punctuation
618
619    # remove any apostrophe that indicates omitted letters
620    s/(\w+)\'(\w*\s)/ $1$2 /g;
621
622    # remove periods that appear in a person's initials
623    s/\s([A-Z])\./ $1 /g;
624
625    # replace hyphens in hyphenated words and names with a space
626    s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
627
628    # Convert the remaining text to "clause format".
629    # This means removing all excess punctuation and garbage text,
630    # normalising valid punctuation to full stops and commas,
631    # then putting one clause on each line.
632
633    # Insert newline when the end of a sentence is detected
634    # (delimiter is:  "[\.\?\!]\s")
635    s/\s*[\.\?\!]\s+/\n/g;
636
637    # split numbers after four digits
638    s/(\d\d\d\d)/$1 /g;
639
640    # split words after 32 characters
641
642    # squash repeated punctuation
643    tr/A-Za-z0-9 //cs;
644
645    # save email addresses
646    # s/\w+@\w+\.[\w\.]+/EMAIL/g;
647
648    # normalise clause breaks (mostly punctuation symbols) to commas
649    s/[^A-Za-z0-9 \n]+/ , /g;
650
651    # Remove repeated commas, and replace with newline
652    s/\s*,[, ]+/\n/g;
653
654    # remove extra whitespace
655    s/ +/ /sg;
656    s/^\s+//mg;
657    s/\s*$/\n/mg;
658
659    # remove lines that contain one word or less
660    s/^\w*$//mg;
661    s/^\s*$//mg;
662    tr/\n//s;
663
664    return $_;
665
666}
667
668
669
670# Execute a system command
671
672sub execute {
673    my ($command, $verbosity, $outhandle) = @_;
674    print $outhandle "Executing: $command\n"  if ($verbosity > 2);
675    $! = 0;
676    my $status = system($command);
677    if ($status != 0) {
678    print STDERR "Phind - Error executing '$command': $!\n";
679    exit($status);  # this causes the build to fail...
680    }
681}
682
683
684# Generate the vocabulary, symbol statistics, and numbers file from the
685# clauses file.  This is legacy code, so is a bit messy and probably won't
686# run under windows.
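# It writes three files into the phind directory:
#   clauses.vocab   - one "best form" word per line (the line number is the word's symbol number)
#   clauses.stats   - ranges of symbol numbers (first_stopword, last_contentword, ...)
#   clauses.numbers - the clauses text re-expressed as one symbol number per token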
687
688sub extract_vocabulary {
689    my ($self) = @_;
690   
691    my $verbosity = $self->{'verbosity'};
692    my $out = $self->{'outhandle'};
693
694    my $collectiondir = $self->{'collectiondir'};
695    my $phinddir = $self->{'phinddir'};
696
697    my $language_exp = $self->{'language'};
698
699    my ($w, $l, $line, $word);
700   
701    my ($first_delimiter, $last_delimiter,
702    $first_stopword, $last_stopword,
703    $first_extractword, $last_extractword,
704    $first_contentword, $last_contentword,
705    $phrasedelimiter);
706   
707    my $thesaurus = $self->{'thesaurus'};
708    my ($thesaurus_links, $thesaurus_terms,
709    %thesaurus, $first_thesaurusword, $last_thesaurusword);
710
711    my %symbol;
712    my (%freq);
713
714    print $out "Calculating vocabulary\n" if ($verbosity > 1);
715
716    # Read and store the stopwords
717    my $stopdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
718    my $stopword_files = ();
719    my ($language, $language_dir, $file, $file_name);
720    my %stopwords;
721
722    # Examine each directory in the stopword directory
723    opendir(STOPDIR, $stopdir);
724    foreach $language (readdir STOPDIR) {
725
726    # Ignore entries that do not match the classifier's language
727    next unless ($language =~ /$language_exp/);
728    $language_dir = &FileUtils::filenameConcatenate($stopdir, $language);
729    next unless (-d "$language_dir");
730
731    opendir(LANGDIR, $language_dir);
732    foreach $file (readdir LANGDIR) {
733
734        # Ignore entries that are not stopword files
735        next unless ($file =~ /sw$/);
736        $file_name = &FileUtils::filenameConcatenate($language_dir, $file);
737        next unless (-f "$file_name");
738
739        # Read the stopwords
740        open(STOPFILE, "<$file_name");
741        while (<STOPFILE>) {
742        s/^\s+//;
743        s/\s.*//;
744        $word = $_;
745        $l = lc($word);
746        $stopwords{$l} = $word;
747        }
748        close STOPFILE;
749
750    }
751    closedir LANGDIR;
752    }
753    closedir STOPDIR;
754
755    # Read thesaurus information
756    if ($thesaurus) {
757
758    # link file exists
759    $thesaurus_links = &FileUtils::filenameConcatenate($collectiondir, "etc", "$thesaurus.lnk");
760    die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
761
762    # ensure term file exists in the correct language
763    if ($language_exp =~ /^([a-z][a-z])/) {
764        $language = $1;
765    } else {
766        $language = 'en';
767    }
768    $thesaurus_terms = &FileUtils::filenameConcatenate($collectiondir, "etc", "$thesaurus.$language");
769    die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
770   
771
772    # Read the thesaurus terms
773    open(TH, "<$thesaurus_terms");
774    while(<TH>) {
775        s/^\d+ //;
776        s/\(.*\)//;
777        foreach $w (split(/\s+/, $_)) {
778        $thesaurus{lc($w)} = $w;
779        }
780    }
781    close TH;
782    }
783
784    # Read words in the text and count occurrences
785    open(TXT, "<$phinddir/clauses");
786
787    my @words;
788    while(<TXT>) {
789    $line = $_;
790    next unless ($line =~ /./);
791   
792    @words = split(/\s+/, $line);
793    foreach $w (@words) {
794        $l = lc($w);
795        $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
796        $freq{$w}++;
797    }
798    $freq{$senlimit}++;
799    }
800   
801    close TXT;
802
803    # Calculate the "best" form of each word
804    my (%bestform, %totalfreq, %bestfreq);
805
806    foreach $w (sort (keys %freq)) {
807    $l = lc($w);
808   
809    # totalfreq is the number of times a term appears in any form
810    $totalfreq{$l} += $freq{$w};
811   
812    if (defined $stopwords{$l}) {
813        $bestform{$l} = $stopwords{$l};
814       
815    } elsif (defined $thesaurus{$l}) {
816        $bestform{$l} = $thesaurus{$l};
817       
818    } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
819        $bestfreq{$l} = $freq{$w};
820        $bestform{$l} = $w;
821    }
822    }
823    undef %freq;
824    undef %bestfreq;
825   
826
827    # Assign symbol numbers to tokens
828    my $nextsymbol = 1;
829    my (@vocab);
830   
831    # Delimiters
832    $first_delimiter = 1;
833   
834    foreach $word (@delimiters) {
835
836#   $word = lc($word); # jrm21
837    $word = uc($word);
838    $bestform{$word} = $word;
839    $vocab[$nextsymbol] = $word;
840    $symbol{$word} = $nextsymbol;
841    $nextsymbol++;
842    }
843    $last_delimiter = $nextsymbol - 1;
844    # Stopwords
845    $first_stopword = $nextsymbol;
846   
847    foreach my $word (sort keys %stopwords) {
848    # don't include stopword unless it occurs in the text
849    $word = lc($word);
850    next unless ($totalfreq{$word});
851    next if ($symbol{$word});
852   
853    $vocab[$nextsymbol] = $word;
854    $symbol{$word} = $nextsymbol;
855    $nextsymbol++;
856    }
857    $last_stopword = $nextsymbol - 1;
858    $first_contentword = $nextsymbol;
859   
860    # Thesaurus terms
861    if ($thesaurus) {
862    $first_thesaurusword = $nextsymbol;
863   
864    foreach my $word (sort keys %thesaurus) {
865       
866        $word = lc($word);
867        next if ($symbol{$word});
868        $bestform{$word} = $thesaurus{$word};
869       
870        $vocab[$nextsymbol] = $word;
871        $symbol{$word} = $nextsymbol;
872        $nextsymbol++;
873       
874    }
875    $last_thesaurusword = $nextsymbol - 1;
876    }
877
878    # Other content words
879    $first_extractword = $nextsymbol;
880   
881    foreach my $word (sort (keys %bestform)) {
882   
883    next if ($symbol{$word});
884   
885    $vocab[$nextsymbol] = $word;
886    $symbol{$word} = $nextsymbol;
887    $nextsymbol++;
888    }
889    $last_extractword = $nextsymbol - 1;
890    $last_contentword = $nextsymbol - 1;
891   
892    # Output the words
893    print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
894    open(VOC, ">$phinddir/clauses.vocab");
895
896    for (my $i = 1; $i < $nextsymbol; $i++) {
897    $w = $vocab[$i];
898
899    print VOC "$bestform{$w}\n";
900    $totalfreq{$w} = 0 unless ($totalfreq{$w});
901    }
902    close VOC;
903
904
905    # Create statistics file
906    # Output statistics about the vocabulary
907    print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
908    &FileUtils::removeFiles("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
909
910    open(STAT, ">$phinddir/clauses.stats")
911    || die "Cannot open $phinddir/clauses.stats: $!";
912
913    print STAT "first_delimiter $first_delimiter\n";
914    print STAT "last_delimiter $last_delimiter\n";
915    print STAT "first_stopword $first_stopword\n";
916    print STAT "last_stopword $last_stopword\n";
917    if ($thesaurus) {
918    print STAT "first_thesaurusword $first_thesaurusword\n";
919    print STAT "last_thesaurusword $last_thesaurusword\n";
920    }
921    print STAT "first_extractword $first_extractword\n";
922    print STAT "last_extractword $last_extractword\n";
923    print STAT "first_contentword $first_contentword\n";
924    print STAT "last_contentword $last_contentword\n";
925    print STAT "first_symbol $first_delimiter\n";
926    print STAT "last_symbol $last_contentword\n";
927    print STAT "first_word $first_stopword\n";
928    print STAT "last_word $last_contentword\n";
929    close STAT;
930
931    undef @vocab;
932
933
934    # Create numbers file
935    # Save text as symbol numbers
936    print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
937   
938    open(TXT, "<$phinddir/clauses");
939    open(NUM, ">$phinddir/clauses.numbers");
940   
941##    $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
942##    print NUM "$symbol{lc($colstart)}\n"; # jrm21
943    $phrasedelimiter = $symbol{$senlimit};
944    print NUM "$symbol{$colstart}\n";
945   
946    # set up the special symbols that delimit documents and sentences
947    while(<TXT>) {
948   
949    # split sentence into a list of tokens
950    $line = $_;
951    next unless ($line =~ /./);
952    @words = split(/\s+/, $line);
953   
954    # output one token at a time
955    foreach $word (@words) {
956# don't lower-case special delimiters - jrm21
957        if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
958        $word = lc($word);
959        }
960        print NUM "$symbol{$word}\n";
961    }
962   
963    # output phrase delimiter
964    print NUM "$phrasedelimiter\n";
965    }
966   
967    close TXT;
968#    print NUM "$symbol{lc($colend)}\n";# jrm21
969    print NUM "$symbol{$colend}\n";
970    close NUM;
971
972    # Save thesaurus data in one convenient file
973    if ($thesaurus) {
974
975    my $thesaurusfile = &FileUtils::filenameConcatenate($phinddir, "$thesaurus.numbers");
976
977
978    print $out "Saving thesaurus as numbers in $thesaurusfile\n"
979        if ($verbosity > 1);
980
981    # Read the thesaurus terms
982    my ($num, $text, %thes_symbols);
983   
984    open(TH, "<$thesaurus_terms");
985    while(<TH>) {
986        chomp;
987        @words = split(/\s+/, $_);
988        $num = shift @words;
989        $text = "";
990
991        # translate words into symbol numbers
992        foreach $word (@words) {
993        $word = lc($word);
994        if ($symbol{$word}) {
995            $text .= "s$symbol{$word} ";
996        } elsif ($verbosity) {
997            print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
998        }
999        }
1000        $text =~ s/ $//;
1001        $thes_symbols{$num} = $text;
1002    }
1003    close TH;
1004
1005    # Read the thesaurus links and write the corresponding data
1006    open(TH, "<$thesaurus_links");
1007    open(THOUT, ">$thesaurusfile");
1008
1009    while(<TH>) {
1010        chomp;
1011        ($num, $text) = split(/:/, $_);
1012
1013        if (defined($thes_symbols{$num})) {
1014        print THOUT "$num:$thes_symbols{$num}:$text\n";
1015        } else {
1016        print THOUT "$num:untranslated:$text\n";
1017        }
1018    }
1019    close TH;   
1020    close THOUT;
1021    }
1022
1023
1024
1025
1026}
1027
1028
1029# renumber_phrases
1030#
1031# Prepare the phrases file to be input to mgpp.  The biggest problem is
1032# reconciling the phrase identifiers used by the suffix program (which
1033# we'll call suffix-id numbers) with the numbers used in the thesaurus
1034# (thesaurus-id) to create a common set of phind id numbers (phind-id).
1035# Phind-id numbers must be sorted by frequency of occurrence.
1036#
1037# Start creating a set of phind-id numbers from the sorted suffix-id
1038# numbers and (if required) the thesaurus-id numbers.  Then add any other
1039# phrases occurring in the thesaurus.
1040#
1041# The last thing we have to do is restore the vocabulary information to the
1042# phrase file so that the phrases are stored as words, not as symbol
1043# numbers.
1044
1045# The original phrases file looks something like this:
1046#   159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1047#   159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1048#   159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1049#   159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
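# Taking the first example line above, the colon-separated fields (as parsed
# in renumber_suffix_data below) are: suffix-id "159396-1", the phrase's
# symbol string "s5175", total frequency 4, 1 expansion ("116149-2"), and
# 3 documents ("d2240,2" means document 2240 with frequency 2; d2253 and
# d2254 default to frequency 1).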
1050
1051
1052sub renumber_phrases {
1053    my ($self) = @_;
1054
1055    renumber_suffix_data($self);
1056    renumber_thesaurus_data($self);
1057    restore_vocabulary_data($self);
1058
1059}
1060
1061
1062
1063# renumber_suffix_data
1064#
1065# Translate phrases file to phrases.2 using phind keys instead
1066# of suffix keys and sorting the expansion data.
1067
1068sub renumber_suffix_data {
1069    my ($self) = @_;
1070   
1071    my $verbosity = $self->{'verbosity'};
1072    my $out = $self->{'outhandle'};
1073    print $out "Translate phrases: suffix-ids become phind-id's\n"
1074    if ($verbosity);
1075   
1076    my $phinddir = $self->{'phinddir'};
1077    my $infile = &FileUtils::filenameConcatenate($phinddir, 'phrases');
1078    my $outfile = &FileUtils::filenameConcatenate($phinddir, 'phrases.2');
1079
1080    # Read the phrase file.  Calculate initial set of phind-id
1081    # numbers and store (suffixid -> frequency) relation.
1082
1083    my %suffixtophind;
1084    my @totalfrequency;
1085    my (@fields, $suffixid);
1086    my $nextphind = 1;
1087   
1088    open(IN, "<$infile");
1089    while(<IN>) {
1090
1091    chomp;
1092    @fields = split(/:/, $_);
1093   
1094    # get next suffixid and phindid
1095    $suffixid = shift @fields;
1096    $suffixtophind{$suffixid} = $nextphind;
1097
1098    # store total frequency
1099    shift @fields;
1100    $totalfrequency[$nextphind] = shift @fields;
1101
1102    $nextphind++;
1103    }
1104    close IN;
1105
1106
1107    # Translate phrases file to phrases.2.  Use phind keys (not suffix
1108    # keys), sort expansion and document occurrence data in order of
1109    # descending frequency.
1110    open(IN, "<$infile");
1111    open(OUT, ">$outfile");
1112   
1113    my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1114    my (@documwents, @newexp, $k, $n);
1115    my $linenumber = 0;
1116
1117    while(<IN>) {
1118   
1119    # read the line
1120    chomp;
1121    @fields = split(/:/, $_);
1122   
1123    # get a phrase number for this line
1124    $suffixid = shift @fields;
1125    die unless (defined($suffixtophind{$suffixid}));
1126    $phindid = $suffixtophind{$suffixid};
1127   
1128    # get the symbols in the phrase
1129    $text = shift @fields;
1130
1131    # output status information
1132    $linenumber++;
1133    if ($verbosity > 2) {
1134        if ($linenumber % 1000 == 0) {
1135        print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1136        }
1137        # what are $num and $key??
1138        #print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1139    }
1140
1141    # get the phrase frequency
1142    $tf = shift @fields;
1143   
1144    # get the number of expansions
1145    $countexp = shift @fields;
1146   
1147    # get the expansions, convert them into phind-id numbers, and sort them
1148    $expansions = shift @fields;
1149    @newexp = ();
1150    foreach $k (split(/,/, $expansions)) {
1151        die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1152        $n = $suffixtophind{$k};
1153        push @newexp, $n;
1154    }
1155    @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1156
1157    # get the number of documents
1158    $countdocs = shift @fields;
1159   
1160    # get the documents and sort them
1161    $documents = shift @fields;
1162    $documents =~ s/d//g;
1163    my @documents = split(/;/, $documents);
1164    @documents = sort by_doc_frequency @documents;
1165
1166    # output the phrase data
1167    print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1168    print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1169   
1170    }
1171
1172    close IN;
1173    close OUT;
1174}
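# Each phrases.2 line produced above therefore looks roughly like:
#   <phind-id>:<symbols>:<tf>:<expansion count>:<document count>:<expansions>,:<documents>;
# with expansions and documents sorted by descending frequency.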
1175
1176
1177# renumber_thesaurus_data
1178#
1179# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1180
1181sub renumber_thesaurus_data {
1182    my ($self) = @_;
1183 
1184    my $out = $self->{'outhandle'};
1185    my $verbosity = $self->{'verbosity'};
1186    my $thesaurus = $self->{'thesaurus'};
1187
1188    my $phinddir = $self->{'phinddir'};
1189    my $infile = &FileUtils::filenameConcatenate($phinddir, "phrases.2");
1190    my $outfile = &FileUtils::filenameConcatenate($phinddir, "phrases.3");
1191
1192
1193    # If no thesaurus is defined, simply move the phrases file.
1194    if (!$thesaurus) {
1195    print $out "Translate phrases.2: no thesaurus data\n"
1196        if ($verbosity);
1197    &FileUtils::moveFiles($infile, $outfile);
1198    return;
1199    }
1200
1201    print $out "Translate phrases.2: add thesaurus data\n"
1202    if ($verbosity);
1203
1204    # 1.
1205    # Read thesaurus file and store (symbols->thesaurusid) mapping
1206    my $thesaurusfile = &FileUtils::filenameConcatenate($phinddir, "$thesaurus.numbers");
1207    my %symbolstothesid;
1208    my (@fields, $thesid, $symbols);
1209   
1210    open(TH, "<$thesaurusfile");
1211
1212    while (<TH>) {
1213   
1214    chomp;
1215    @fields = split(/:/, $_);
1216   
1217    # get id and text
1218    $thesid = shift @fields;
1219    $symbols = shift @fields;
1220    $symbolstothesid{$symbols} = $thesid;
1221    }
1222    close TH;   
1223
1224    # 2.
1225    # Read phrases file to find thesaurus entries that already
1226    # have a phindid.  Store their phind-ids for later translation,
1227    # and store their frequency for later sorting.
1228    my %thesaurustophindid;
1229    my %phindidtofrequency;
1230    my ($phindid, $freq);
1231
1232    open(IN, "<$infile");
1233   
1234    while(<IN>) {
1235   
1236    chomp;
1237    @fields = split(/:/, $_);
1238   
1239    # phindid and symbols for this line
1240    $phindid = shift @fields;
1241    $symbols = shift @fields;
1242    $freq = shift @fields;
1243
1244    # do we have a thesaurus id corresponding to this phrase?
1245    if (defined($symbolstothesid{$symbols})) {
1246        $thesid = $symbolstothesid{$symbols};
1247        $thesaurustophindid{$thesid} = $phindid;
1248        $phindidtofrequency{$phindid} = $freq;
1249    }
1250    }
1251    close IN;
1252 
1253    undef %symbolstothesid;
1254
1255    # 3.
1256    # Create phind-id numbers for remaining thesaurus entries,
1257    # and note that their frequency is 0 for later sorting.
1258    my $nextphindid = $phindid + 1;
1259
1260    open(TH, "<$thesaurusfile");
1261    while(<TH>) {
1262   
1263    chomp;
1264    @fields = split(/:/, $_);
1265   
1266    # read thesaurus-id and ensure it has a corresponding phind-id
1267    $thesid = shift @fields;
1268    if (!defined($thesaurustophindid{$thesid})) {
1269        $thesaurustophindid{$thesid} = $nextphindid;
1270        $phindidtofrequency{$nextphindid} = 0;
1271        $nextphindid++;
1272    }
1273    }
1274    close TH;
1275
1276    # 4.
1277    # Translate thesaurus file, replacing thesaurus-id numbers with
1278    # phind-id numbers.
1279    my $newthesaurusfile = &FileUtils::filenameConcatenate($phinddir, "$thesaurus.phindid");
1280    my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1281    my (@links, $linkid, %linkidtotype, $newrelation);
1282
1283    open(TH, "<$thesaurusfile");
1284    open(TO, ">$newthesaurusfile");
1285    while(<TH>) {
1286   
1287    chomp;
1288    @fields = split(/:/, $_);
1289   
1290    # phindid and symbols for this line
1291    ($thesid, $symbols, $relations) = @fields;
1292   
1293    die unless ($thesid && $symbols);
1294    die unless $thesaurustophindid{$thesid};
1295    $phindid = $thesaurustophindid{$thesid};
1296
1297    # convert each part of the relation string to use phind-id numbers
1298    # at the same time, we want to sort the list by frequency.
1299    undef %linkidtotype;
1300   
1301    foreach $linktext (split(/;/, $relations)) {
1302        @linkdata = split(/,/, $linktext);
1303       
1304        # remember the linktype (e.g. BT, NT)
1305        $linktype = shift @linkdata;
1306       
1307        # store the type of each link
1308        foreach $thesid (@linkdata) {
1309        die unless (defined($thesaurustophindid{$thesid}));
1310        $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1311        }
1312    }
1313
1314    # sort the list of links, first by frequency, then by type.
1315    @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1316                        or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1317    $linkcounter = (scalar @links);
1318
1319    # create a string describing the link information
1320    $linktype = $linkidtotype{$links[0]};
1321    $newrelation = $linktype;
1322    foreach $linkid (@links) {
1323        if ($linkidtotype{$linkid} ne $linktype) {
1324        $linktype = $linkidtotype{$linkid};
1325        $newrelation .= ";" . $linktype;
1326        }
1327        $newrelation .= "," . $linkid;
1328    }
1329    $newrelation .= ";";
1330   
1331
1332    # output the new line
1333    print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1334    }
1335    close TH;
1336    close TO;
1337
1338    undef %thesaurustophindid;
1339    undef %linkidtotype;
1340    undef %phindidtofrequency;
1341
1342    # 5.
1343    # Read thesaurus data (in phind-id format) into memory
1344    my %thesaurusdata;
1345
1346    open(TH, "<$newthesaurusfile");
1347    while(<TH>) {
1348    chomp;
1349    ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1350    die unless ($phindid && $symbols);
1351    $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1352    }
1353    close TH;
1354
1355    # 6.
1356    # Add thesaurus data to phrases file
1357    my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1358    my (@documwents, @newexp, $k, $n);
1359    my $linenumber = 0;
1360
1361    open(IN, "<$infile");
1362    open(OUT, ">$outfile");
1363
1364    # Update existing phrases
1365    while(<IN>) {
1366   
1367    chomp;
1368    @fields = split(/:/, $_);
1369       
1370    # get data for this line
1371    $phindid = shift @fields;
1372   
1373    # output the phrase data, with thesaurus information
1374    print OUT "$phindid:", join(":", @fields);
1375
1376    # add thesaurus data
1377    if (defined($thesaurusdata{$phindid})) {
1378        @fields = split(/:/, $thesaurusdata{$phindid});
1379        shift @fields;
1380        $linkcounter = shift @fields;
1381        $relations = shift @fields;
1382
1383        print OUT ":$linkcounter:$relations";
1384        $thesaurusdata{$phindid} = "";
1385    }
1386    print OUT "\n";
1387    }
1388    close IN;
1389
1390    # Add phrases that aren't already in the file
1391    foreach $phindid (sort numerically keys %thesaurusdata) {
1392    next unless ($thesaurusdata{$phindid});
1393
1394    @fields = split(/:/, $thesaurusdata{$phindid});
1395    $symbols = shift @fields;
1396    $linkcounter = shift @fields;
1397    $relations = shift @fields;
1398
1399    print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1400    }
1401    close OUT;
1402
1403}
1404
1405# restore_vocabulary_data
1406#
1407# Read phrases.3 and restore vocabulary information. Then write
1408# this data to the MGPP input files (pword.txt and pdata.txt) and
1409# (if requested) to the saved phrases file.
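# Each phrase becomes one "<Document>" record in the MGPP input files,
# roughly:
#   pdata.txt:  <Document><phind-id>:<phrase text>:<remaining fields>:
#   pword.txt:  <Document><word>     (empty for multi-word phrases)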
1410
1411sub restore_vocabulary_data {
1412    my ($self) = @_;
1413 
1414    my $out = $self->{'outhandle'};
1415    my $verbosity = $self->{'verbosity'};
1416    print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1417
1418    my $phinddir = $self->{'phinddir'};
1419    my $infile = &FileUtils::filenameConcatenate($phinddir, 'phrases.3');
1420    my $vocabfile = &FileUtils::filenameConcatenate($phinddir, 'clauses.vocab');
1421    my $datafile = &FileUtils::filenameConcatenate($phinddir, 'pdata.txt');
1422    my $wordfile = &FileUtils::filenameConcatenate($phinddir, 'pword.txt');
1423
1424    my $savephrases = $self->{'savephrases'};
1425
1426    # 1.
1427    # Read the vocabulary file
1428    open(V, "<$vocabfile")
1429    || die "Cannot open $vocabfile: $!";
1430    my @symbol;
1431    my $i = 1;
1432    while(<V>) {
1433    chomp;
1434    $symbol[$i++] = $_;
1435    }
1436    close V;
1437
1438    # 2.
1439    # Translate phrases.3 to MGPP input files
1440    my ($key, $text, $word, $isThesaurus, $line);
1441    my @fields;
1442    my $linenumber = 0;
1443
1444    open(IN, "<$infile");
1445    open(DATA, ">$datafile");
1446    open(WORD, ">$wordfile");
1447
1448    # Save the phrases in a separate text file
1449    if ($savephrases) {
1450    print $out "Saving phrases in $savephrases\n" if ($verbosity);
1451    open(SAVE, ">$savephrases");
1452    }
1453
1454    while(<IN>) {
1455   
1456    # read the line
1457    chomp;
1458    $line = $_;
1459    @fields = split(/:/, $line);
1460   
1461    # get a phrase number for this line
1462    $key = shift @fields;
1463   
1464    # restore the text of the phrase
1465    $text = shift @fields;
1466    $text =~ s/s(\d+)/$symbol[$1]/g;
1467    if ($text =~ / /) {
1468        $word = "";
1469    } elsif ($text ne 'untranslated') {
1470        $word = $text;
1471    }
1472
1473    # output the phrase data
1474    print DATA "<Document>";
1475    print DATA "$key:$text:", join(":", @fields), ":\n";
1476   
1477    # output the word index search data
1478    print WORD "<Document>$word\n";
1479
1480    # output the phrases to a text file
1481    if ($savephrases) {
1482        if ((scalar @fields) == 7) {
1483        $isThesaurus = 1;
1484        } else {
1485        $isThesaurus = 0;
1486        }
1487        print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1488    }
1489    }
1490    close IN;
1491    close WORD;
1492    close DATA;
1493    close SAVE if ($savephrases);
1494
1495}
1496
1497
1498
1499# sort routines used to renumber phrases
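# by_doc_frequency compares document entries of the form "<docnum>,<freq>"
# (a bare "<docnum>" counts as frequency 1) and orders them by descending
# frequency.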
1500
1501sub numerically { $a <=> $b }
1502
1503sub by_doc_frequency {
1504    my $fa = 1;
1505    if ($a =~ /,/) {
1506    $fa = $a;
1507    $fa =~ s/\d+,//;
1508    }
1509    my $fb = 1;
1510    if ($b =~ /,/) {
1511    $fb = $b;
1512    $fb =~ s/\d+,//;
1513    }
1514
1515    return ($fb <=> $fa);
1516}
1517
15181;