source: trunk/gsdl/perllib/classify/Phind.pm@ 6408

Last change on this file since 6408 was 6408, checked in by jmt12, 20 years ago

Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the catagory regexp for any regular expression (which can then be hidden under lower detail modes in GLI)

  • Property svn:keywords set to Author Date Id Revision
File size: 41.9 KB
Line 
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind clasifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32# 12/05/02 Added usage datastructure - John Thompson
33
34package Phind;
35
36use BasClas;
37use util;
38use ghtml;
39use unicode;
40
41my @removedirs = ();
42
43my %wanted_index_files = ('td'=>1,
44 't'=>1,
45 'ti'=>1,
46 'tl'=>1,
47 'tsd'=>1,
48 'idb'=>1,
49 'ib1'=>1,
50 'ib2'=>1,
51 'ib3'=>1,
52 'i'=>1,
53 'il'=>1,
54 'w'=>1,
55 'wa'=>1);
56
57sub BEGIN {
58 @ISA = ('BasClas');
59}
60
61sub END {
62
63 # Tidy up stray files - we do this here as there's some weird problem
64 # preventing us from doing it in the get_classify_info() function (on
65 # windows at least) where the close() appears to fail on txthandle and
66 # dochandle, thus preventing us from deleting those files
67
68 foreach my $dir (@removedirs) {
69 if (-d $dir && opendir (DIR, $dir)) {
70 my @files = readdir DIR;
71 closedir DIR;
72
73 foreach $file (@files) {
74 next if $file =~ /^\.\.?$/;
75 my ($suffix) = $file =~ /\.([^\.]+)$/;
76 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
77 # delete it!
78 &util::rm (&util::filename_cat ($dir, $file));
79 }
80 }
81 }
82 }
83}
84
85my $arguments =
86 [ { 'name' => "text",
87 'desc' => "{Phind.text}",
88 'type' => "string",
89 'deft' => "section:Title,section:text",
90 'reqd' => "no" },
91 { 'name' => "title",
92 'desc' => "{Phind.title}",
93 'type' => "metadata",
94 'deft' => "Title",
95 'reqd' => "no" },
96 { 'name' => "button",
97 'desc' => "{Phind.button}",
98 'type' => "string",
99 'deft' => "Phrase",
100 'reqd' => "no" },
101 { 'name' => "language",
102 'desc' => "{Phind.language}",
103 'type' => "language",
104 'deft' => "en",
105 'reqd' => "no" },
106 { 'name' => "savephrases",
107 'desc' => "{Phind.savephrases}",
108 'type' => "string",
109 'deft' => "",
110 'reqd' => "no" },
111 { 'name' => "builddir",
112 'desc' => "{BasClas.builddir}",
113 'type' => "string",
114 'deft' => "" },
115 { 'name' => "suffixmode",
116 'desc' => "{Phind.suffixmode}",
117 'type' => "int",
118 'deft' => "1",
119 'reqd' => "no" },
120 { 'name' => "thesaurus",
121 'desc' => "{Phind.thesaurus}",
122 'type' => "string",
123 'deft' => "",
124 'reqd' => "no" },
125 { 'name' => "untidy",
126 'desc' => "{Phind.untidy}",
127 'type' => "flag",
128 'reqd' => "no" } ];
129
130my $options = { 'name' => "Phind",
131 'desc' => "{Phind.desc}",
132 'abstract' => "no",
133 'inherits' => "yes",
134 'args' => $arguments };
135
136# sub print_usage {
137# print STDERR "
138# usage: classify Phind [options]
139
140# options:
141# -text Fields The text used to build the phrase hierarchy.
142# (default: 'section:Title,section:text')
143
144# -title Title The metadata field used to describe each document.
145# (default: 'Title')
146
147# -button Name The label for the classifier screen and button in
148# navigation bar.
149# (default: 'Phrase')
150
151# -language Regex Language or languages to use building hierarchy.
152# Languages are identified by two-letter country codes
153# like en (English), es (Spanish), and fr (French).
154# Language is a regular expression, so 'en|fr' (English or
155# French) and '..' (match any language) are valid.
156# (default: 'en'.)
157
158# -savephrases File If set, the phrase infomation will be stored in
159# the given file as text. It is probably a good idea
160# to use an absolute path.
161# (default: not set)
162
163# -suffixmode N The smode parameter to the phrase extraction program. A
164# value of 0 means that stopwords are ignored, and of 1
165# means that stopwords are used.
166# (default: 1)
167
168# -thesaurus Name Name of a thesaurus stored in Phind format in the
169# collection's etc directory.
170# (default: not set)
171
172# -untidy Don't remove working files.
173
174# "; }
175
176# Phrase delimiter symbols - these should be abstracted out someplace
177
178my $colstart = "COLLECTIONSTART";
179my $colend = "COLLECTIONEND";
180my $doclimit = "DOCUMENTLIMIT";
181my $senlimit = "SENTENCELIMIT";
182my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
183
184
185# Create a new Phind browser based on collect.cfg
186
187sub new {
188 my $class = shift (@_);
189 my $self = new BasClas($class, @_);
190
191 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
192 my $option_list = $self->{'option_list'};
193 push( @{$option_list}, $options );
194
195 my $out = $self->{'outhandle'};
196
197 # Ensure the Phind generate scripts are in place
198 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
199 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
200 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
201
202 if (!(-e $file1)) {
203 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
204 exit(1);
205 }
206
207 # Parse classifier arguments
208 my $builddir = "";
209 if (!parsargv::parse(\@_,
210 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
211 q^title/.*/Title^, \$self->{'titlefield'},
212 q^button/.*/Phrase^, \$self->{'buttonname'},
213 q^language/.*/en^, \$language,
214 q^builddir/.*/^, \$builddir,
215 q^savephrases/.*/^, \$self->{'savephrases'},
216 q^suffixmode/\d/1^, \$self->{'suffixmode'},
217 q^thesaurus/.*/^, \$self->{'thesaurus'},
218 q^untidy^, \$self->{'untidy'},
219 "allow_extra_options")) {
220
221 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
222 $self->print_txt_usage(""); # Use default resource bundle
223 die "\n";
224 }
225
226 # classifier information
227 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
228
229 # limit languages
230 $self->{'language_exp'} = $language;
231
232 # collection directories
233 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
234 if (!$builddir) {
235 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
236 }
237 $self->{'builddir'} = $builddir;
238
239 $self->{'total'} = 0;
240
241 return bless $self, $class;
242}
243
244
245# Initialise the Phind classifier
246
247sub init {
248 my $self = shift (@_);
249
250 # ensure we have a build directory
251 my $builddir = $self->{'builddir'};
252 die unless (-e "$builddir");
253
254 # create Phind directory
255 my $phnumber = 1;
256 my $phinddir = &util::filename_cat($builddir, "phind1");
257 while (-e "$phinddir") {
258 $phnumber++;
259 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
260 }
261 &util::mk_dir("$phinddir");
262 $self->{'phinddir'} = $phinddir;
263 $self->{'phindnumber'} = $phnumber;
264
265 push(@removedirs, $phinddir) unless $self->{'untidy'};
266
267 # open filehandles for documents and text
268 my $clausefile = &util::filename_cat("$phinddir", "clauses");
269 &util::rm($clausefile) if (-e $clausefile);
270
271 my $txthandle = 'TEXT' . $phnumber;
272 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
273 $self->{'txthandle'} = $txthandle;
274
275 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
276 &util::rm($docfile) if (-e $docfile);
277
278 my $dochandle = 'DOC' . $phnumber;
279 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
280 $self->{'dochandle'} = $dochandle;
281
282}
283
284
285# Classify each document.
286#
287# Each document is passed here in turn. The classifier extracts the
288# text of each and stores it in the clauses file. Document details are
289# stored in the docs.txt file.
290
291sub classify {
292 my $self = shift (@_);
293 my $doc_obj = shift @_;
294
295 my $verbosity = $self->{'verbosity'};
296 my $top_section = $doc_obj->get_top_section();
297
298 my $titlefield = $self->{'titlefield'};
299
300 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
301 if (!defined($title)) {
302 $title = "";
303 print STDERR "Phind: document has no title\n";
304 }
305 print "process: $title\n" if ($verbosity > 2);
306
307 # Only consider the file if it is in the correct language
308 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
309 my $phrlanguage = $self->{'language_exp'};
310 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
311
312 # record this file
313 $self->{'total'} ++;
314 print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
315
316
317 # Store document details
318 my $OID = $doc_obj->get_OID();
319 $OID = "NULL" unless defined $OID;
320 my $dochandle = $self->{'dochandle'};
321 print $dochandle "<Document>\t$OID\t$title\n";
322
323 # Store the text occuring in this object
324
325 # output the document delimiter
326 my $txthandle = $self->{'txthandle'};
327 print $txthandle "$doclimit\n";
328
329 # iterarate over the required indexes and store their text
330 my $indexes = $self->{'indexes'};
331 my $text = "";
332 my ($part, $level, $field, $section, $data, $dataref);
333
334 foreach $part (split(/,/, $indexes)) {
335
336 # Each field has a level and a data element ((e.g. document:Title)
337 ($level, $field) = split(/:/, $part);
338 die unless ($level && $field);
339
340 # Extract the text from every section
341 # (In phind, document:text and section:text are equivalent)
342 if ($field eq "text") {
343 $data = "";
344 $section = $doc_obj->get_top_section();
345 while (defined($section)) {
346 $data .= $doc_obj->get_text($section) . "\n";
347 $section = $doc_obj->get_next_section($section);
348 }
349 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
350 }
351
352 # Extract a metadata field from a document
353 # (If there is more than one element of the given type, get them all.)
354 elsif ($level eq "document") {
355 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
356 foreach $data (@$dataref) {
357 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
358 }
359 }
360
361 # Extract metadata from every section in a document
362 elsif ($level eq "section") {
363 $data = "";
364 $section = $doc_obj->get_top_section();
365 while (defined($section)) {
366 $dataref = $doc_obj->get_metadata($section, $field);
367 $data .= join("\n", @$dataref) . "\n";
368 $section = $doc_obj->get_next_section($section);
369 }
370 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
371 }
372
373 # Some sort of specification which I don't understand
374 else {
375 die "Unknown level ($level) in Phind index ($part)\n";
376 }
377
378 }
379
380 # output the text
381 $text =~ tr/\n//s;
382 print $txthandle "$text";
383}
384
385
386# Construct the classifier from the information already gathered
387#
388# When get_classify_info is called, the clauses and docs.txt files have
389# already been constructed in the Phind directory. This function will
390# translate them into compressed, indexed MGPP files that can be read by
391# the phindcgi script. It will also register our classifier so that it
392# shows up in the navigation bar.
393
394sub get_classify_info {
395 my $self = shift (@_);
396 my ($gli) = @_;
397
398 close $self->{'dochandle'};
399 close $self->{'txthandle'};
400 my $verbosity = $self->{'verbosity'};
401 my $out = $self->{'outhandle'};
402 my $phinddir = $self->{'phinddir'};
403
404 my $osextra = "";
405 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
406 $osextra = " -d /";
407 }
408
409 print STDERR "</Stage>\n" if $gli;
410
411 if ($verbosity) {
412 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
413 print $out "*** in ", $self->{'phinddir'}, "\n";
414 }
415
416 print STDERR "<Stage name='Phind'>\n" if $gli;
417
418 # Construct phind indexes
419 my $suffixmode = $self->{'suffixmode'};
420 my ($command, $status);
421
422 # Generate the vocabulary, symbol statistics, and numbers file
423 # from the clauses file
424 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
425 print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
426 &extract_vocabulary($self);
427
428 # Use the suffix program to generate the phind/phrases file
429 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
430 print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
431 &execute("suffix \"$phinddir\" $suffixmode $verbosity", $verbosity, $out);
432
433 # check that we generated some files. It's not necessarily an error if
434 # we didn't (execute() would have quit on error), but we can't go on.
435 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
436 if (! -r $phrasesfile) {
437 print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
438 print $out "\nNo phrases found for Phind classifier!\n";
439 return;
440 }
441
442 # Create the phrase file and put phrase numbers in phind/phrases
443 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
444 print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
445 &renumber_phrases($self);
446
447 print $out "\nCreating phrase databases\n";
448 print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
449 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
450 my $mg_stem = &util::filename_cat($phinddir, "pdata");
451
452 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
453 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
454 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
455
456 # create the mg index of words
457 print $out "\nCreating word-level search indexes\n";
458 print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
459 $mg_input = &util::filename_cat($phinddir, "pword.txt");
460 $mg_stem = &util::filename_cat($phinddir, "pword");
461
462 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
463 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
464 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
465 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
466 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
467 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
468
469 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
470 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
471 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
472
473 # create the mg document information database
474 print $out "\nCreating document information databases\n";
475 print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
476 $mg_input = &util::filename_cat($phinddir, "docs.txt");
477 $mg_stem = &util::filename_cat($phinddir, "docs");
478
479 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
480 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
481 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
482
483 # Return the information about the classifier that we'll later want to
484 # use to create macros when the Phind classifier document is displayed.
485 my %classifyinfo = ('thistype'=>'Invisible',
486 'childtype'=>'Phind',
487 'Title'=>$self->{'buttonname'},
488 'parameters'=>"phindnumber=$self->{'phindnumber'}",
489 'contains'=>[]);
490
491 my $collection = $self->{'collection'};
492 my $url = "library?a=p&p=phind&c=$collection";
493 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
494
495 return \%classifyinfo;
496}
497
498
499
500sub convert_gml_to_tokens {
501
502 my ($language_exp, $text) = @_;
503
504 # escape any magic words... - jrm21
505 foreach my $delim (@delimiters) {
506 my $replacement=lc($delim);
507 my $num= $text=~ s/$delim/$replacement/g;
508 if (!$num) {$num=0;}
509 }
510
511 if ($language_exp =~ /en/) {
512 return &convert_gml_to_tokens_EN($text);
513 }
514
515 if ($language_exp =~ /zh/) {
516 return &convert_gml_to_tokens_ZH($text);
517 }
518
519 $_ = $text;
520
521 # 1. remove GML tags
522
523 # Remove everything that is in a tag
524 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
525 s/\s*<br>\s*/ LINEBREAK /isgo;
526 s/<[^>]*>/ /sgo;
527
528 # Now we have the text, but it may contain HTML
529 # elements coded as &gt; etc. Remove these tags.
530 s/&amp;/&/sgo;
531 s/&lt;/</sgo;
532 s/&gt;/>/sgo;
533 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
534 s/\s*<br>\s*/ LINEBREAK /isgo;
535 s/<[^>]*>/ /sgo;
536
537 # replace<p> and <br> placeholders with clause break symbol (\n)
538 s/\s+/ /gso;
539 s/PARAGRAPHBREAK/\n/sgo;
540 s/LINEBREAK/\n/sgo;
541
542
543
544
545 # 2. Split the remaining text into space-delimited tokens
546
547 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
548 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
549
550 # Split text at word boundaries
551 s/\b/ /go;
552
553 # 3. Convert the remaining text to "clause format"
554
555 # Insert newline if the end of a sentence is detected
556 # (delimter is: "[\.\?\!]\s")
557 # s/\s*[\.\?\!]\s+/\n/go;
558
559 # remove unnecessary punctuation and replace with clause break symbol (\n)
560 s/[^\w ]/\n/go;
561
562 # remove extraneous whitespace
563 s/ +/ /sgo;
564 s/^\s+//mgo;
565 s/\s*$/\n/mgo;
566
567 # remove lines that contain one word or less
568 s/^\S*$//mgo;
569 s/^\s*$//mgo;
570 tr/\n//s;
571
572 return $_;
573}
574
575# a chinese version
576sub convert_gml_to_tokens_ZH {
577
578 $_ = shift @_;
579
580 # Replace all whitespace with a simple space
581 s/\s+/ /gs;
582 # Remove everything that is in a tag
583 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
584 s/\s*<br>\s*/ LINEBREAK /isg;
585 s/<[^>]*>/ /sg;
586
587 # Now we have the text, but it may contain HTML
588 # elements coded as &gt; etc. Remove these tags.
589 s/&lt;/</sg;
590 s/&gt;/>/sg;
591
592 s/\s+/ /sg;
593 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
594 s/\s*<br>\s*/ LINEBREAK /isg;
595 s/<[^>]*>/ /sg;
596
597 # remove &amp; and other miscellaneous markup tags
598 s/&amp;/&/sg;
599 s/&lt;/</sg;
600 s/&gt;/>/sg;
601 s/&amp;/&/sg;
602
603 # replace<p> and <br> placeholders with carriage returns
604 s/PARAGRAPHBREAK/\n/sg;
605 s/LINEBREAK/\n/sg;
606
607
608# print STDERR "text:$_\n";
609 return $_;
610}
611
612# A version of convert_gml_to_tokens that is fine-tuned to the English language.
613
614sub convert_gml_to_tokens_EN {
615 $_ = shift @_;
616
617 # FIRST, remove GML tags
618
619 # Replace all whitespace with a simple space
620 s/\s+/ /gs;
621
622 # Remove everything that is in a tag
623 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
624 s/\s*<br>\s*/ LINEBREAK /isg;
625 s/<[^>]*>/ /sg;
626
627 # Now we have the text, but it may contain HTML
628 # elements coded as &gt; etc. Remove these tags.
629 s/&lt;/</sg;
630 s/&gt;/>/sg;
631
632 s/\s+/ /sg;
633 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
634 s/\s*<br>\s*/ LINEBREAK /isg;
635 s/<[^>]*>/ /sg;
636
637 # remove &amp; and other miscellaneous markup tags
638 s/&amp;/&/sg;
639 s/&lt;/</sg;
640 s/&gt;/>/sg;
641 s/&amp;/&/sg;
642
643 # replace<p> and <br> placeholders with carriage returns
644 s/PARAGRAPHBREAK/\n/sg;
645 s/LINEBREAK/\n/sg;
646
647
648 # Exceptional punctuation
649 #
650 # We make special cases of some punctuation
651
652 # remove any apostrophe that indicates omitted letters
653 s/(\w+)\'(\w*\s)/ $1$2 /g;
654
655 # remove period that appears in a person's initals
656 s/\s([A-Z])\./ $1 /g;
657
658 # replace hyphens in hypheanted words and names with a space
659 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
660
661 # Convert the remaining text to "clause format",
662 # This means removing all excess punctuation and garbage text,
663 # normalising valid punctuation to fullstops and commas,
664 # then putting one cluse on each line.
665
666 # Insert newline when the end of a sentence is detected
667 # (delimter is: "[\.\?\!]\s")
668 s/\s*[\.\?\!]\s+/\n/g;
669
670 # split numbers after four digits
671 s/(\d\d\d\d)/$1 /g;
672
673 # split words after 32 characters
674
675 # squash repeated punctuation
676 tr/A-Za-z0-9 //cs;
677
678 # save email addresses
679 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
680
681 # normalise clause breaks (mostly punctuation symbols) to commas
682 s/[^A-Za-z0-9 \n]+/ , /g;
683
684 # Remove repeated commas, and replace with newline
685 s/\s*,[, ]+/\n/g;
686
687 # remove extra whitespace
688 s/ +/ /sg;
689 s/^\s+//mg;
690 s/\s*$/\n/mg;
691
692 # remove lines that contain one word or less
693 s/^\w*$//mg;
694 s/^\s*$//mg;
695 tr/\n//s;
696
697 return $_;
698
699}
700
701
702
703# Execute a system command
704
705sub execute {
706 my ($command, $verbosity, $outhandle) = @_;
707 print $outhandle "Executing: $command\n" if ($verbosity > 2);
708 $! = 0;
709 my $status = system($command);
710 if ($status != 0) {
711 print STDERR "Phind - Error executing '$command': $!\n";
712 exit($status); # this causes the build to fail...
713 }
714}
715
716
717# Generate the vocabulary, symbol statistics, and numbers file from the
718# clauses file. This is legacy code, so is a bit messy and probably wont
719# run under windows.
720
721sub extract_vocabulary {
722 my ($self) = @_;
723
724 my $verbosity = $self->{'verbosity'};
725 my $out = $self->{'outhandle'};
726
727 my $collectiondir = $self->{'collectiondir'};
728 my $phinddir = $self->{'phinddir'};
729
730 my $language_exp = $self->{'language_exp'};
731
732 my ($w, $l, $line, $word);
733
734 my ($first_delimiter, $last_delimiter,
735 $first_stopword, $last_stopword,
736 $first_extractword, $last_extractword,
737 $first_contentword, $last_contentword,
738 $phrasedelimiter);
739
740 my $thesaurus = $self->{'thesaurus'};
741 my ($thesaurus_links, $thesaurus_terms,
742 %thesaurus, $first_thesaurusword, $last_thesaurusword);
743
744 my %symbol;
745 my (%freq);
746
747 print $out "Calculating vocabulary\n" if ($verbosity > 1);
748
749 # Read and store the stopwords
750 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
751 my $stopword_files = ();
752 my ($language, $language_dir, $file, $file_name);
753 my %stopwords;
754
755 # Examine each directory in the stopword directory
756 opendir(STOPDIR, $stopdir);
757 foreach $language (readdir STOPDIR) {
758
759 # Ignore entries that do not match the classifier's language
760 next unless ($language =~ /$language_exp/);
761 $language_dir = &util::filename_cat($stopdir, $language);
762 next unless (-d "$language_dir");
763
764 opendir(LANGDIR, $language_dir);
765 foreach $file (readdir LANGDIR) {
766
767 # Ignore entries that are not stopword files
768 next unless ($file =~ /sw$/);
769 $file_name = &util::filename_cat($language_dir, $file);
770 next unless (-f "$file_name");
771
772 # Read the stopwords
773 open(STOPFILE, "<$file_name");
774 while (<STOPFILE>) {
775 s/^\s+//;
776 s/\s.*//;
777 $word = $_;
778 $l = lc($word);
779 $stopwords{$l} = $word;
780 }
781 close STOPFILE;
782
783 }
784 closedir LANGDIR;
785 }
786 closedir STOPDIR;
787
788 # Read thesaurus information
789 if ($thesaurus) {
790
791 # link file exists
792 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
793 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
794
795 # ensure term file exists in the correct language
796 if ($language_exp =~ /^([a-z][a-z])/) {
797 $language = $1;
798 } else {
799 $language = 'en';
800 }
801 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
802 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
803
804
805 # Read the thesaurus terms
806 open(TH, "<$thesaurus_terms");
807 while(<TH>) {
808 s/^\d+ //;
809 s/\(.*\)//;
810 foreach $w (split(/\s+/, $_)) {
811 $thesaurus{lc($w)} = $w;
812 }
813 }
814 close TH;
815 }
816
817 # Read words in the text and count occurences
818 open(TXT, "<$phinddir/clauses");
819
820 my @words;
821 while(<TXT>) {
822 $line = $_;
823 next unless ($line =~ /./);
824
825 @words = split(/\s+/, $line);
826 foreach $w (@words) {
827 $l = lc($w);
828 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
829 $freq{$w}++;
830 }
831 $freq{$senlimit}++;
832 }
833
834 close TXT;
835
836 # Calculate the "best" form of each word
837 my (%bestform, %totalfreq, %bestfreq);
838
839 foreach $w (sort (keys %freq)) {
840 $l = lc($w);
841
842 # totalfreq is the number of times a term appears in any form
843 $totalfreq{$l} += $freq{$w};
844
845 if (defined $stopwords{$l}) {
846 $bestform{$l} = $stopwords{$l};
847
848 } elsif (defined $thesaurus{$l}) {
849 $bestform{$l} = $thesaurus{$l};
850
851 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
852 $bestfreq{$l} = $freq{$w};
853 $bestform{$l} = $w;
854 }
855 }
856 undef %freq;
857 undef %bestfreq;
858
859
860 # Assign symbol numbers to tokens
861 my $nextsymbol = 1;
862 my (@vocab);
863
864 # Delimiters
865 $first_delimiter = 1;
866
867 foreach $word (@delimiters) {
868
869# $word = lc($word); # jrm21
870 $word = uc($word);
871 $bestform{$word} = $word;
872 $vocab[$nextsymbol] = $word;
873 $symbol{$word} = $nextsymbol;
874 $nextsymbol++;
875 }
876 $last_delimiter = $nextsymbol - 1;
877 # Stopwords
878 $first_stopword = $nextsymbol;
879
880 foreach my $word (sort keys %stopwords) {
881 # don't include stopword unless it occurs in the text
882 $word = lc($word);
883 next unless ($totalfreq{$word});
884 next if ($symbol{$word});
885
886 $vocab[$nextsymbol] = $word;
887 $symbol{$word} = $nextsymbol;
888 $nextsymbol++;
889 }
890 $last_stopword = $nextsymbol - 1;
891 $first_contentword = $nextsymbol;
892
893 # Thesaurus terms
894 if ($thesaurus) {
895 $first_thesaurusword = $nextsymbol;
896
897 foreach my $word (sort keys %thesaurus) {
898
899 $word = lc($word);
900 next if ($symbol{$word});
901 $bestform{$word} = $thesaurus{$word};
902
903 $vocab[$nextsymbol] = $word;
904 $symbol{$word} = $nextsymbol;
905 $nextsymbol++;
906
907 }
908 $last_thesaurusword = $nextsymbol - 1;
909 }
910
911 # Other content words
912 $first_extractword = $nextsymbol;
913
914 foreach my $word (sort (keys %bestform)) {
915
916 next if ($symbol{$word});
917
918 $vocab[$nextsymbol] = $word;
919 $symbol{$word} = $nextsymbol;
920 $nextsymbol++;
921 }
922 $last_extractword = $nextsymbol - 1;
923 $last_contentword = $nextsymbol - 1;
924
925 # Outut the words
926 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
927 open(VOC, ">$phinddir/clauses.vocab");
928
929 for (my $i = 1; $i < $nextsymbol; $i++) {
930 $w = $vocab[$i];
931
932 print VOC "$bestform{$w}\n";
933 $totalfreq{$w} = 0 unless ($totalfreq{$w});
934 }
935 close VOC;
936
937
938 # Create statistics file
939 # Output statistics about the vocablary
940 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
941 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
942
943 open(STAT, ">$phinddir/clauses.stats")
944 || die "Cannot open $phinddir/clauses.stats: $!";
945
946 print STAT "first_delimiter $first_delimiter\n";
947 print STAT "last_delimiter $last_delimiter\n";
948 print STAT "first_stopword $first_stopword\n";
949 print STAT "last_stopword $last_stopword\n";
950 if ($thesaurus) {
951 print STAT "first_thesaurusword $first_thesaurusword\n";
952 print STAT "last_thesaurusword $last_thesaurusword\n";
953 }
954 print STAT "first_extractword $first_extractword\n";
955 print STAT "last_extractword $last_extractword\n";
956 print STAT "first_contentword $first_contentword\n";
957 print STAT "last_contentword $last_contentword\n";
958 print STAT "first_symbol $first_delimiter\n";
959 print STAT "last_symbol $last_contentword\n";
960 print STAT "first_word $first_stopword\n";
961 print STAT "last_word $last_contentword\n";
962 close STAT;
963
964 undef @vocab;
965
966
967 # Create numbers file
968 # Save text as symbol numbers
969 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
970
971 open(TXT, "<$phinddir/clauses");
972 open(NUM, ">$phinddir/clauses.numbers");
973
974## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
975## print NUM "$symbol{lc($colstart)}\n"; # jrm21
976 $phrasedelimiter = $symbol{$senlimit};
977 print NUM "$symbol{$colstart}\n";
978
979 # set up the special symbols that delimit documents and sentences
980 while(<TXT>) {
981
982 # split sentence into a list of tokens
983 $line = $_;
984 next unless ($line =~ /./);
985 @words = split(/\s+/, $line);
986
987 # output one token at a time
988 foreach $word (@words) {
989# don't lower-case special delimiters - jrm21
990 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
991 $word = lc($word);
992 }
993 print NUM "$symbol{$word}\n";
994 }
995
996 # output phrase delimiter
997 print NUM "$phrasedelimiter\n";
998 }
999
1000 close TXT;
1001# print NUM "$symbol{lc($colend)}\n";# jrm21
1002 print NUM "$symbol{$colend}\n";
1003 close NUM;
1004
1005 # Save thesaurus data in one convienient file
1006 if ($thesaurus) {
1007
1008 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1009
1010
1011 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
1012 if ($verbosity > 1);
1013
1014 # Read the thesaurus terms
1015 my ($num, $text, %thes_symbols);
1016
1017 open(TH, "<$thesaurus_terms");
1018 while(<TH>) {
1019 chomp;
1020 @words = split(/\s+/, $_);
1021 $num = shift @words;
1022 $text = "";
1023
1024 # translate words into symbol numbers
1025 foreach $word (@words) {
1026 $word = lc($word);
1027 if ($symbol{$word}) {
1028 $text .= "s$symbol{$word} ";
1029 } elsif ($verbosity) {
1030 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
1031 }
1032 }
1033 $text =~ s/ $//;
1034 $thes_symbols{$num} = $text;
1035 }
1036 close TH;
1037
1038 # Read the thesaurus links and write the corresponding data
1039 open(TH, "<$thesaurus_links");
1040 open(THOUT, ">$thesaurusfile");
1041
1042 while(<TH>) {
1043 chomp;
1044 ($num, $text) = split(/:/, $_);
1045
1046 if (defined($thes_symbols{$num})) {
1047 print THOUT "$num:$thes_symbols{$num}:$text\n";
1048 } else {
1049 print THOUT "$num:untranslated:$text\n";
1050 }
1051 }
1052 close TH;
1053 close THOUT;
1054 }
1055
1056
1057
1058
1059}
1060
1061
1062# renumber_phrases
1063#
1064# Prepare the phrases file to be input to mgpp. The biggest problem is
1065# reconciling the phrase identifiers used by the suffix program (which
1066# we'll call suffix-id numbers) with the numbers used in the thesaurus
1067# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
1068# Phind-id numbers must be sorted by frequency of occurance.
1069#
1070# Start creating a set of phind-id numbers from the sorted suffix-id
1071# numbers and (if required) the thesaurus-id numbers. Then add any other
1072# phrases occuring in the thesaurus.
1073#
1074# The last thing we have to do is restore the vocabulary information to the
1075# phrase file so that the phrases are stored as words, not as symbol
1076# numbers.
1077
1078# The original phrases file looks something like this:
1079# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1080# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1081# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1082# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1083
1084
1085sub renumber_phrases {
1086 my ($self) = @_;
1087
1088 renumber_suffix_data($self);
1089 renumber_thesaurus_data($self);
1090 restore_vocabulary_data($self);
1091
1092}
1093
1094
1095
1096# renumber_suffix_data
1097#
1098# Translate phrases file to phrases.2 using phind keys instead
1099# of suffix keys and sorting the expansion data.
1100
1101sub renumber_suffix_data {
1102 my ($self) = @_;
1103
1104 my $verbosity = $self->{'verbosity'};
1105 my $out = $self->{'outhandle'};
1106 print $out "Translate phrases: suffix-ids become phind-id's\n"
1107 if ($verbosity);
1108
1109 my $phinddir = $self->{'phinddir'};
1110 my $infile = &util::filename_cat($phinddir, 'phrases');
1111 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1112
1113 # Read the phrase file. Calculate initial set of phind-id
1114 # numbers and store (suffixid -> frequency) relation.
1115
1116 my %suffixtophind;
1117 my @phindfrequency;
1118 my (@fields, $suffixid);
1119 my $nextphind = 1;
1120
1121 open(IN, "<$infile");
1122 while(<IN>) {
1123
1124 chomp;
1125 @fields = split(/:/, $_);
1126
1127 # get next suffixid and phindid
1128 $suffixid = shift @fields;
1129 $suffixtophind{$suffixid} = $nextphind;
1130
1131 # store total frequency
1132 shift @fields;
1133 $totalfrequency[$nextphind] = shift @fields;
1134
1135 $nextphind++;
1136 }
1137 close IN;
1138
1139
1140 # Translate phrases file to phrases.2. Use phind keys (not suffix
1141 # keys), sort expansion and document occurance data in order of
1142 # descending frequency..
1143 open(IN, "<$infile");
1144 open(OUT, ">$outfile");
1145
1146 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1147 my (@documwents, @newexp, $k, $n);
1148 my $linenumber = 0;
1149
1150 while(<IN>) {
1151
1152 # read the line
1153 chomp;
1154 @fields = split(/:/, $_);
1155
1156 # get a phrase number for this line
1157 $suffixid = shift @fields;
1158 die unless (defined($suffixtophind{$suffixid}));
1159 $phindid = $suffixtophind{$suffixid};
1160
1161 # get the symbols in the phrase
1162 $text = shift @fields;
1163
1164 # output status information
1165 $linenumber++;
1166 if ($verbosity > 2) {
1167 if ($linenumber % 1000 == 0) {
1168 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1169 }
1170 print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1171 }
1172
1173 # get the phrase frequency
1174 $tf = shift @fields;
1175
1176 # get the number of expansions
1177 $countexp = shift @fields;
1178
1179 # get the expansions, convert them into phind-id numbers, and sort them
1180 $expansions = shift @fields;
1181 @newexp = ();
1182 foreach $k (split(/,/, $expansions)) {
1183 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1184 $n = $suffixtophind{$k};
1185 push @newexp, $n;
1186 }
1187 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1188
1189 # get the number of documents
1190 $countdocs = shift @fields;
1191
1192 # get the documents and sort them
1193 $documents = shift @fields;
1194 $documents =~ s/d//g;
1195 @documents = split(/;/, $documents);
1196 @documents = sort by_doc_frequency @documents;
1197
1198 # output the phrase data
1199 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1200 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1201
1202 }
1203
1204 close IN;
1205 close OUT;
1206}
1207
1208
1209# renumber_thesaurus_data
1210#
1211# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1212
1213sub renumber_thesaurus_data {
1214 my ($self) = @_;
1215
1216 my $out = $self->{'outhandle'};
1217 my $verbosity = $self->{'verbosity'};
1218 my $thesaurus = $self->{'thesaurus'};
1219
1220 my $phinddir = $self->{'phinddir'};
1221 my $infile = &util::filename_cat($phinddir, "phrases.2");
1222 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1223
1224
1225 # If no thesaurus is defined, simply move the phrases file.
1226 if (!$thesaurus) {
1227 print $out "Translate phrases.2: no thesaurus data\n"
1228 if ($verbosity);
1229 &util::mv($infile, $outfile);
1230 return;
1231 }
1232
1233 print $out "Translate phrases.2: add thesaurus data\n"
1234 if ($verbosity);
1235
1236 # 1.
1237 # Read thesaurus file and store (symbols->thesaurusid) mapping
1238 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1239 my %symbolstothesid;
1240 my (@fields, $thesid, $symbols);
1241
1242 open(TH, "<$thesaurusfile");
1243
1244 while (<TH>) {
1245
1246 chomp;
1247 @fields = split(/:/, $_);
1248
1249 # get id and text
1250 $thesid = shift @fields;
1251 $symbols = shift @fields;
1252 $symbolstothesid{$symbols} = $thesid;
1253 }
1254 close TH;
1255
1256 # 2.
1257 # Read phrases file to find thesaurus entries that already
1258 # have a phindid. Store their phind-ids for later translation,
1259 # and store their frequency for later sorting.
1260 my %thesaurustophindid;
1261 my %phindidtofrequency;
1262 my ($phindid, $freq);
1263
1264 open(IN, "<$infile");
1265
1266 while(<IN>) {
1267
1268 chomp;
1269 @fields = split(/:/, $_);
1270
1271 # phindid and symbols for this line
1272 $phindid = shift @fields;
1273 $symbols = shift @fields;
1274 $freq = shift @fields;
1275
1276 # do we have a thesaurus id corresponding to this phrase?
1277 if (defined($symbolstothesid{$symbols})) {
1278 $thesid = $symbolstothesid{$symbols};
1279 $thesaurustophindid{$thesid} = $phindid;
1280 $phindidtofrequency{$phindid} = $freq;
1281 }
1282 }
1283 close IN;
1284
1285 undef %symbolstothesid;
1286
1287 # 3.
1288 # Create phind-id numbers for remaining thesaurus entries,
1289 # and note that their frequency is 0 for later sorting.
1290 my $nextphindid = $phindid + 1;
1291
1292 open(TH, "<$thesaurusfile");
1293 while(<TH>) {
1294
1295 chomp;
1296 @fields = split(/:/, $_);
1297
1298 # read thesaurus-id and ensure it has a corresponding phind-id
1299 $thesid = shift @fields;
1300 if (!defined($thesaurustophindid{$thesid})) {
1301 $thesaurustophindid{$thesid} = $nextphindid;
1302 $phindidtofrequency{$nextphindid} = 0;
1303 $nextphindid++;
1304 }
1305 }
1306 close TH;
1307
1308 # 4.
1309 # Translate thesaurus file, replacing thesaurus-id numbers with
1310 # phind-id numbers.
1311 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1312 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1313 my (@links, $linkid, %linkidtotype, $newrelation);
1314
1315 open(TH, "<$thesaurusfile");
1316 open(TO, ">$newthesaurusfile");
1317 while(<TH>) {
1318
1319 chomp;
1320 @fields = split(/:/, $_);
1321
1322 # phindid and symbols for this line
1323 ($thesid, $symbols, $relations) = @fields;
1324
1325 die unless ($thesid && $symbols);
1326 die unless $thesaurustophindid{$thesid};
1327 $phindid = $thesaurustophindid{$thesid};
1328
1329 # convert each part of the relation string to use phind-id numbers
1330 # at the same time, we want to sort the list by frequency.
1331 undef %linkidtotype;
1332
1333 foreach $linktext (split(/;/, $relations)) {
1334 @linkdata = split(/,/, $linktext);
1335
1336 # remember the linktype (e.g. BT, NT)
1337 $linktype = shift @linkdata;
1338
1339 # store the type of each link
1340 foreach $thesid (@linkdata) {
1341 die unless (defined($thesaurustophindid{$thesid}));
1342 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1343 }
1344 }
1345
1346 # sort the list of links, first by frequency, then by type.
1347 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1348 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1349 $linkcounter = (scalar @links);
1350
1351 # create a string describing the link information
1352 $linktype = $linkidtotype{$links[0]};
1353 $newrelation = $linktype;
1354 foreach $linkid (@links) {
1355 if ($linkidtotype{$linkid} ne $linktype) {
1356 $linktype = $linkidtotype{$linkid};
1357 $newrelation .= ";" . $linktype;
1358 }
1359 $newrelation .= "," . $linkid;
1360 }
1361 $newrelation .= ";";
1362
1363
1364 # output the new line
1365 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1366 }
1367 close TH;
1368 close TO;
1369
1370 undef %thesaurustophindid;
1371 undef %linkidtotype;
1372 undef %phindidtofrequency;
1373
1374 # 5.
1375 # Read thesaurus data (in phind-id format) into memory
1376 my %thesaurusdata;
1377
1378 open(TH, "<$newthesaurusfile");
1379 while(<TH>) {
1380 chomp;
1381 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1382 die unless ($phindid && $symbols);
1383 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1384 }
1385 close TH;
1386
1387 # 6.
1388 # Add thesaurus data to phrases file
1389 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1390 my (@documwents, @newexp, $k, $n);
1391 my $linenumber = 0;
1392
1393 open(IN, "<$infile");
1394 open(OUT, ">$outfile");
1395
1396 # Update existing phrases
1397 while(<IN>) {
1398
1399 chomp;
1400 @fields = split(/:/, $_);
1401
1402 # get data for this line
1403 $phindid = shift @fields;
1404
1405 # output the phrase data, with thesaurus information
1406 print OUT "$phindid:", join(":", @fields);
1407
1408 # add thesaurus data
1409 if (defined($thesaurusdata{$phindid})) {
1410 @fields = split(/:/, $thesaurusdata{$phindid});
1411 shift @fields;
1412 $linkcounter = shift @fields;
1413 $relations = shift @fields;
1414
1415 print OUT ":$linkcounter:$relations";
1416 $thesaurusdata{$phindid} = "";
1417 }
1418 print OUT "\n";
1419 }
1420 close IN;
1421
1422 # Add phrases that aren't already in the file
1423 foreach $phindid (sort numerically keys %thesaurusdata) {
1424 next unless ($thesaurusdata{$phindid});
1425
1426 @fields = split(/:/, $thesaurusdata{$phindid});
1427 $symbols = shift @fields;
1428 $linkcounter = shift @fields;
1429 $relations = shift @fields;
1430
1431 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1432 }
1433 close OUT;
1434
1435}
1436
1437# restore_vocabulary_data
1438#
1439# Read phrases.3 and restore vocabulary information. Then write
1440# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1441# (if requested) to the saved phrases file.
1442
1443sub restore_vocabulary_data {
1444 my ($self) = @_;
1445
1446 my $out = $self->{'outhandle'};
1447 my $verbosity = $self->{'verbosity'};
1448 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1449
1450 my $phinddir = $self->{'phinddir'};
1451 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1452 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1453 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1454 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1455
1456 my $savephrases = $self->{'savephrases'};
1457
1458 # 1.
1459 # Read the vocabulary file
1460 open(V, "<$vocabfile")
1461 || die "Cannot open $vocabfile: $!";
1462 my @symbol;
1463 my $i = 1;
1464 while(<V>) {
1465 chomp;
1466 $symbol[$i++] = $_;
1467 }
1468 close V;
1469
1470 # 2.
1471 # Translate phrases.3 to MGPP input files
1472 my ($key, $text, $word, $isThesaurus);
1473 my @fields;
1474 my $linenumber = 0;
1475
1476 open(IN, "<$infile");
1477 open(DATA, ">$datafile");
1478 open(WORD, ">$wordfile");
1479
1480 # Save the phrases in a separate text file
1481 if ($savephrases) {
1482 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1483 open(SAVE, ">$savephrases");
1484 }
1485
1486 while(<IN>) {
1487
1488 # read the line
1489 chomp;
1490 $line = $_;
1491 @fields = split(/:/, $line);
1492
1493 # get a phrase number for this line
1494 $key = shift @fields;
1495
1496 # restore the text of the phrase
1497 $text = shift @fields;
1498 $text =~ s/s(\d+)/$symbol[$1]/g;
1499 if ($text =~ / /) {
1500 $word = "";
1501 } elsif ($text ne 'untranslated') {
1502 $word = $text;
1503 }
1504
1505 # output the phrase data
1506 print DATA "<Document>";
1507 print DATA "$key:$text:", join(":", @fields), ":\n";
1508
1509 # output the word index search data
1510 print WORD "<Document>$word\n";
1511
1512 # output the phrases to a text file
1513 if ($savephrases) {
1514 if ((scalar @fields) == 7) {
1515 $isThesaurus = 1;
1516 } else {
1517 $isThesaurus = 0;
1518 }
1519 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1520 }
1521 }
1522 close IN;
1523 close WORD;
1524 close DATA;
1525 close SAVE if ($savephrases);
1526
1527}
1528
1529
1530
1531# sort routines used to renumber phrases
1532
1533sub numerically { $a <=> $b }
1534
1535sub by_doc_frequency {
1536 my $fa = 1;
1537 if ($a =~ /,/) {
1538 $fa = $a;
1539 $fa =~ s/\d+,//;
1540 }
1541 my $fb = 1;
1542 if ($b =~ /,/) {
1543 $fb = $b;
1544 $fb =~ s/\d+,//;
1545 }
1546
1547 return ($fb <=> $fa);
1548}
1549
15501;
Note: See TracBrowser for help on using the repository browser.