source: trunk/gsdl/perllib/classify/phind.pm@ 2576

Last change on this file since 2576 was 2576, checked in by sjboddie, 23 years ago

Moved phind's stopword directory from etc to etc/packages/phind

[1643]1###########################################################################
2#
3# phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
[1890]29# The phind classifier plugin.
30# Options are described in the print_usage function.
31# Type "classinfo.pl phind" at the command line for a summary.
[1643]32
33package phind;
34
35use BasClas;
36use util;
[1897]37use ghtml;
38use unicode;
[1643]39
[2509]40my @removedirs = ();
41
42my %wanted_index_files = ('td'=>1,
43 't'=>1,
44 'ti'=>1,
45 'tl'=>1,
46 'tsd'=>1,
47 'idb'=>1,
48 'ib1'=>1,
49 'ib2'=>1,
50 'ib3'=>1,
51 'i'=>1,
52 'il'=>1,
53 'w'=>1,
54 'wa'=>1);
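# (These suffixes appear to cover the compressed text and index files that
# mgpp builds for the "pdata", "pword" and "docs" stems in get_classify_info();
# anything else left in a phind working directory is treated as a temporary
# file and removed by the END block below, unless -untidy was given.)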
55
[1643]56sub BEGIN {
57 @ISA = ('BasClas');
58}
59
[2509]60sub END {
61
62 # Tidy up stray files - we do this here as there's some weird problem
63 # preventing us from doing it in the get_classify_info() function (on
64 # windows at least) where the close() appears to fail on txthandle and
65 # dochandle, thus preventing us from deleting those files
[1643]66
[2509]67 foreach my $dir (@removedirs) {
68 if (-d $dir && opendir (DIR, $dir)) {
69 my @files = readdir DIR;
70 closedir DIR;
71
72 foreach $file (@files) {
73 next if $file =~ /^\.\.?$/;
74 my ($suffix) = $file =~ /\.([^\.]+)$/;
75 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
76 # delete it!
77 &util::rm (&util::filename_cat ($dir, $file));
78 }
79 }
80 }
81 }
82}
83
[1839]84sub print_usage {
85 print STDERR "
86 usage: classify phind [options]
87
88 options:
[1883]89 -text Fields The text used to build the phrase hierarchy.
90 (default: 'section:Title,section:text')
[1839]91
[1883]92 -title Title The metadata field used to describe each document.
93 (default: 'Title')
94
95 -button Name The label for the classifier screen and button in the
96 navigation bar.
97 (default: 'Phrase')
98
99 -language Regex Language or languages to use when building the hierarchy.
100 Languages are identified by two-letter language codes
101 like en (English), es (Spanish), and fr (French).
102 Language is a regular expression, so 'en|fr' (English or
103 French) and '..' (match any language) are valid.
104 (default: 'en')
105
106 -savephrases File If set, the phrase information will be stored in
107 the given file as text. It is probably a good idea
108 to use an absolute path.
109 (default: not set)
110
111 -suffixmode N The smode parameter to the phrase extraction program.
112 A value of 0 means that stopwords are ignored; a value
113 of 1 means that stopwords are used.
114 (default: 1)
115
116 -thesaurus Name Name of a thesaurus stored in phind format in the
117 collection's etc directory.
118 (default: not set)
119
120 -untidy Don't remove working files.
121
122"; }
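# As a rough illustration (not taken from a real collection), a collect.cfg
# entry using these options might look like:
#
#   classify phind -text section:Title,section:text -title Title -button Phrase -language en
#
# An optional "-thesaurus mythesaurus" (a hypothetical name) would point at
# mythesaurus.lnk and mythesaurus.en files stored in phind format in the
# collection's etc directory.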
123
[1890]124# Phrase delimiter symbols - these should be abstracted out someplace
125
126my $colstart = "COLLECTIONSTART";
127my $colend = "COLLECTIONEND";
128my $doclimit = "DOCUMENTLIMIT";
129my $senlimit = "SENTENCELIMIT";
130my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
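# Roughly speaking (based on how they are written out below): COLLECTIONSTART
# and COLLECTIONEND bracket the whole collection in the clauses.numbers file,
# DOCUMENTLIMIT is written to the clauses file before each document's text,
# and SENTENCELIMIT marks the end of each clause.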
131
132
[1839]133# Create a new phind browser based on collect.cfg
134
[1643]135sub new {
[1839]136 my $class = shift (@_);
137 my $self = new BasClas($class, @_);
[1643]138
[1839]139 my $out = $self->{'outhandle'};
140
[1643]141 # Ensure the Phind "suffix" program is in place
142 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
[2487]143 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
[1643]144 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
145
146 if (!(-e $file1)) {
[2500]147 print STDERR "phind.pm: ERROR: The phind \"suffix\" program is not installed.\n\n";
[1643]148 exit(1);
149 }
150
[1839]151 # Parse classifier arguments
[1803]152 my $builddir = "";
[1839]153 if (!parsargv::parse(\@_,
154 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
155 q^title/.*/Title^, \$self->{'titlefield'},
156 q^button/.*/Phrase^, \$self->{'buttonname'},
[1871]157 q^language/.*/en^, \$language,
[1839]158 q^builddir/.*/^, \$builddir,
159 q^savephrases/\d/0^, \$self->{'savephrases'},
160 q^suffixmode/\d/1^, \$self->{'suffixmode'},
161 q^thesaurus/.*/^, \$self->{'thesaurus'},
162 q^untidy^, \$self->{'untidy'},
163 "allow_extra_options")) {
164
165 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
166 &print_usage();
167 die "\n";
[1643]168 }
[1839]169
[1803]170 # classifier information
[1839]171 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
[1646]172
[1643]173 # limit languages
174 $self->{'language_exp'} = $language;
[1839]175
176 # collection directories
[1829]177 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
[1803]178 if (!$builddir) {
179 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
[1643]180 }
[1803]181 $self->{'builddir'} = $builddir;
[2487]182
183 $self->{'total'} = 0;
[1839]184
[1643]185 return bless $self, $class;
186}
187
188
189# Initialise the phind classifier
190
191sub init {
192 my $self = shift (@_);
193
[1803]194 # ensure we have a build directory
195 my $builddir = $self->{'builddir'};
196 die unless (-e "$builddir");
197
198 # create phind directory
[2025]199 my $phnumber = 1;
200 my $phinddir = &util::filename_cat($builddir, "phind1");
201 while (-e "$phinddir") {
202 $phnumber++;
203 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
[1803]204 }
205 &util::mk_dir("$phinddir");
[2025]206 $self->{'phinddir'} = $phinddir;
207 $self->{'phindnumber'} = $phnumber;
[1803]208
[2509]209 push(@removedirs, $phinddir) unless $self->{'untidy'};
210
[1643]211 # open filehandles for documents and text
[1803]212 my $clausefile = &util::filename_cat("$phinddir", "clauses");
[1643]213 &util::rm($clausefile) if (-e $clausefile);
214
[2025]215 my $txthandle = 'TEXT' . $phnumber;
216 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
217 $self->{'txthandle'} = $txthandle;
218
[1803]219 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
[1643]220 &util::rm($docfile) if (-e $docfile);
[2025]221
222 my $dochandle = 'DOC' . $phnumber;
223 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
224 $self->{'dochandle'} = $dochandle;
[1643]225
226}
227
[1890]228
[1643]229# Classify each document.
230#
231# Each document is passed here in turn. The classifier extracts the
232# text of each and stores it in the clauses file. Document details are
233# stored in the docs.txt file.
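#
# For example (with a made-up OID and title), each document adds one line of
# the form "<Document>\tOID\tTitle" to docs.txt, and writes a DOCUMENTLIMIT
# marker followed by its clause text to the clauses file.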
234
235sub classify {
236 my $self = shift (@_);
[2025]237 my $doc_obj = shift @_;
[1643]238
239 my $verbosity = $self->{'verbosity'};
240 my $top_section = $doc_obj->get_top_section();
241
[1803]242 my $titlefield = $self->{'titlefield'};
243
244 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
[1643]245 print "process: $title\n" if ($verbosity > 2);
246
[1883]247 # Only consider the file if it is in the correct language
[1643]248 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
249 my $phrlanguage = $self->{'language_exp'};
250 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
251
252 # record this file
[2487]253 $self->{'total'} ++;
254 print "file $self->{'total'}: $title\n" if ($self->{'verbosity'});
[1643]255
256
257 # Store document details
258 my $OID = $doc_obj->get_OID();
259 $OID = "NULL" unless defined $OID;
260 my $dochandle = $self->{'dochandle'};
261 print $dochandle "<Document>\t$OID\t$title\n";
262
263 # Store the text occurring in this object
264
265 # output the document delimiter
266 my $txthandle = $self->{'txthandle'};
267 print $txthandle "$doclimit\n";
268
269 # iterate over the required indexes and store their text
270 my $indexes = $self->{'indexes'};
271 my $text = "";
[1890]272 my ($part, $level, $field, $section, $data, $dataref);
[1643]273
274 foreach $part (split(/,/, $indexes)) {
275
276 # Each field has a level and a data element (e.g. document:Title)
277 ($level, $field) = split(/:/, $part);
278 die unless ($level && $field);
[2025]279
[1643]280 # Extract the text from every section
281 # (In phind, document:text and section:text are equivalent)
282 if ($field eq "text") {
283 $data = "";
284 $section = $doc_obj->get_top_section();
285 while (defined($section)) {
286 $data .= $doc_obj->get_text($section) . "\n";
287 $section = $doc_obj->get_next_section($section);
288 }
[1949]289 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
[1643]290 }
291
292 # Extract a metadata field from a document
[1890]293 # (If there is more than one element of the given type, get them all.)
[1643]294 elsif ($level eq "document") {
[1890]295 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
[2206]296 foreach $data (@$dataref) {
[1949]297 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
[1890]298 }
299 }
300
[1643]301 # Extract metadata from every section in a document
302 elsif ($level eq "section") {
303 $data = "";
304 $section = $doc_obj->get_top_section();
305 while (defined($section)) {
[2025]306 $dataref = $doc_obj->get_metadata($section, $field);
307 $data .= join("\n", @$dataref) . "\n";
[1643]308 $section = $doc_obj->get_next_section($section);
309 }
[1949]310 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
[1643]311 }
312
313 # Some sort of specification which I don't understand
314 else {
315 die "Unknown level ($level) in phind index ($part)\n";
316 }
317
318 }
[2025]319
[1643]320 # output the text
321 $text =~ tr/\n//s;
322 print $txthandle "$text";
323}
324
325
326# Construct the classifier from the information already gathered
327#
328# When get_classify_info is called, the clauses and docs.txt files have
[1803]329# already been constructed in the phind directory. This function will
[1643]330# translate them into compressed, indexed MGPP files that can be read by
331# the phindcgi script. It will also register our classifier so that it
[1803]332# shows up in the navigation bar.
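#
# Three mgpp databases are built in the phind directory: "pdata" (the phrase
# data), "pword" (the word-level search index) and "docs" (the document
# information); phindcgi reads all three at run time.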
[1643]333
334sub get_classify_info {
335 my $self = shift (@_);
336
[2487]337 close $self->{'dochandle'};
[2333]338 close $self->{'txthandle'};
[1643]339 my $verbosity = $self->{'verbosity'};
[1839]340 my $out = $self->{'outhandle'};
[1883]341 my $phinddir = $self->{'phinddir'};
[1839]342
[2487]343 my $osextra = "";
344 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
345 $osextra = " -d /";
346 }
347
[1645]348 if ($verbosity) {
[1839]349 print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
[2539]350 print $out "*** in ", $self->{'phinddir'}, "\n";
[1645]351 }
352
[1643]353 # Construct phind indexes
[1646]354 my $suffixmode = $self->{'suffixmode'};
[1643]355 my ($command, $status);
356
357 # Generate the vocabulary, symbol statistics, and numbers file
358 # from the clauses file
[1839]359 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
[1829]360 &extract_vocabulary($self);
[1643]361
[1803]362 # Use the suffix program to generate the phind/phrases file
[1839]363 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
[2529]364 &execute("suffix \"$phinddir\" $suffixmode $verbosity", $verbosity, $out);
[1643]365
[1803]366 # Create the phrase file and put phrase numbers in phind/phrases
[2025]367 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
[1808]368 &renumber_phrases($self);
[1643]369
[1839]370 print $out "\nCreating phrase databases\n";
[1803]371 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
[2487]372 my $mg_stem = &util::filename_cat($phinddir, "pdata");
[1643]373
[2529]374 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
375 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
376 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
[1643]377
378 # create the mg index of words
[1839]379 print $out "\nCreating word-level search indexes\n";
[1803]380 $mg_input = &util::filename_cat($phinddir, "pword.txt");
[2487]381 $mg_stem = &util::filename_cat($phinddir, "pword");
[1643]382
[2529]383 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
384 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
385 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
386 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
387 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
388 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
[1643]389
[2529]390 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
391 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
392 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
[1643]393
394 # create the mg document information database
[1839]395 print $out "\nCreating document information databases\n";
[1803]396 $mg_input = &util::filename_cat($phinddir, "docs.txt");
[2487]397 $mg_stem = &util::filename_cat($phinddir, "docs");
[1643]398
[2529]399 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
400 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
401 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
[1643]402
[2025]403 # Return the information about the classifier that we'll later want to
404 # use to create macros when the Phind classifier document is displayed.
[1643]405 my %classifyinfo = ('thistype'=>'Invisible',
[2025]406 'childtype'=>'Phind',
407 'Title'=>$self->{'buttonname'},
408 'parameters'=>"phindnumber=$self->{'phindnumber'}",
[1643]409 'contains'=>[]);
410
[2025]411 my $collection = $self->{'collection'};
412 my $url = "library?a=p&p=phind&c=$collection";
[1643]413 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
[2025]414
[1643]415 return \%classifyinfo;
416}
417
418
419
420sub convert_gml_to_tokens {
421
[1949]422 my ($language_exp, $text) = @_;
[1643]423
[1897]424 if ($language_exp =~ /en/) {
[1949]425 return &convert_gml_to_tokens_EN($text);
[1897]426 }
427
[1890]428 $_ = $text;
[1643]429
[2008]430 # 1. remove GML tags
[1643]431
432 # Remove everything that is in a tag
[1890]433 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
434 s/\s*<br>\s*/ LINEBREAK /isgo;
435 s/<[^>]*>/ /sgo;
[1643]436
437 # Now we have the text, but it may contain HTML
438 # elements coded as &gt; etc. Remove these tags.
[2008]439 s/&amp;/&/sgo;
[1890]440 s/&lt;/</sgo;
441 s/&gt;/>/sgo;
442 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
443 s/\s*<br>\s*/ LINEBREAK /isgo;
444 s/<[^>]*>/ /sgo;
[1643]445
[2008]446 # replace <p> and <br> placeholders with the clause break symbol (\n)
447 s/\s+/ /gso;
[1890]448 s/PARAGRAPHBREAK/\n/sgo;
449 s/LINEBREAK/\n/sgo;
[1643]450
[1890]451
[1897]452
453
[2008]454 # 2. Split the remaining text into space-delimited tokens
[1897]455
[2008]456 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
457 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
[1897]458
[2008]459 # Split text at word boundaries
460 s/\b/ /go;
461
462 # 3. Convert the remaining text to "clause format"
463
464 # Insert newline if the end of a sentence is detected
[1897]465 # (delimiter is: "[\.\?\!]\s")
[2008]466 # s/\s*[\.\?\!]\s+/\n/go;
[1897]467
[2008]468 # remove unnecessary punctuation and replace with clause break symbol (\n)
469 s/[^\w ]/\n/go;
470
471 # remove extraneous whitespace
[1897]472 s/ +/ /sgo;
473 s/^\s+//mgo;
474 s/\s*$/\n/mgo;
[1643]475
[1897]476 # remove lines that contain one word or less
477 s/^\S*$//mgo;
478 s/^\s*$//mgo;
479 tr/\n//s;
[1643]480
[1897]481 return $_;
482}
[1643]483
[1897]484# A version of convert_gml_to_tokens that is fine-tuned to the English language.
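#
# As a rough illustration (hypothetical input; the details depend on the
# punctuation rules below), a fragment like
#
#   <p>The quick, brown fox. It jumped!</p>
#
# comes out as one clause per line:
#
#   The quick
#   brown fox
#   It jumped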
[1643]485
[1897]486sub convert_gml_to_tokens_EN {
487 $_ = shift @_;
[1643]488
[1897]489 # FIRST, remove GML tags
[1643]490
[1897]491 # Replace all whitespace with a simple space
492 s/\s+/ /gs;
[1890]493
[1897]494 # Remove everything that is in a tag
495 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
496 s/\s*<br>\s*/ LINEBREAK /isg;
497 s/<[^>]*>/ /sg;
498
499 # Now we have the text, but it may contain HTML
500 # elements coded as &gt; etc. Remove these tags.
501 s/&lt;/</sg;
502 s/&gt;/>/sg;
503
504 s/\s+/ /sg;
505 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
506 s/\s*<br>\s*/ LINEBREAK /isg;
507 s/<[^>]*>/ /sg;
508
509 # remove &amp; and other miscellaneous markup tags
510 s/&amp;/&/sg;
511 s/&lt;/</sg;
512 s/&gt;/>/sg;
513 s/&amp;/&/sg;
514
515 # replace <p> and <br> placeholders with carriage returns
516 s/PARAGRAPHBREAK/\n/sg;
517 s/LINEBREAK/\n/sg;
518
519
520 # Exceptional punctuation
521 #
522 # We make special cases of some punctuation
523
524 # remove any apostrophe that indicates omitted letters
525 s/(\w+)\'(\w*\s)/ $1$2 /g;
526
527 # remove periods that appear in a person's initials
528 s/\s([A-Z])\./ $1 /g;
529
530 # replace hyphens in hyphenated words and names with a space
531 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
532
[1643]533 # Convert the remaining text to "clause format".
534 # This means removing all excess punctuation and garbage text,
535 # normalising valid punctuation to fullstops and commas,
[1897]536 # then putting one clause on each line.
[1643]537
538 # Insert newline when the end of a sentence is detected
539 # (delimiter is: "[\.\?\!]\s")
[1897]540 s/\s*[\.\?\!]\s+/\n/g;
[1643]541
[1897]542 # split numbers after four digits
543 s/(\d\d\d\d)/$1 /g;
[1643]544
[1897]545 # split words after 32 characters
[1643]546
[1897]547 # squash repeated punctuation
548 tr/A-Za-z0-9 //cs;
[1643]549
[1897]550 # save email addresses
551 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
[1643]552
[1897]553 # normalise clause breaks (mostly punctuation symbols) to commas
554 s/[^A-Za-z0-9 \n]+/ , /g;
[1643]555
[1897]556 # Remove repeated commas, and replace with newline
557 s/\s*,[, ]+/\n/g;
[1643]558
559 # remove extra whitespace
[1897]560 s/ +/ /sg;
561 s/^\s+//mg;
562 s/\s*$/\n/mg;
[1643]563
564 # remove lines that contain one word or less
[1897]565 s/^\w*$//mg;
566 s/^\s*$//mg;
[1643]567 tr/\n//s;
568
569 return $_;
[1897]570
[1643]571}
572
573
[1897]574
[1643]575# Execute a system command
576
577sub execute {
[1839]578 my ($command, $verbosity, $outhandle) = @_;
579 print $outhandle "Executing: $command\n" if ($verbosity > 2);
[2487]580 $! = 0;
[1643]581 my $status = system($command);
582 if ($status != 0) {
[2487]583 print STDERR "phind - Error executing '$command': $!\n";
[1643]584 exit($status);
585 }
586}
587
588
589# Generate the vocabulary, symbol statistics, and numbers file from the
590# clauses file. This is legacy code, so it is a bit messy and probably won't
591# run under Windows.
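#
# In outline (see the code below): clauses.vocab gets one "best form" word per
# line, so the line number doubles as the word's symbol number; clauses.stats
# records which symbol ranges hold delimiters, stopwords, thesaurus terms and
# content words; and clauses.numbers is the clauses file rewritten with every
# word replaced by its symbol number.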
592
593sub extract_vocabulary {
[1829]594 my ($self) = @_;
595
596 my $verbosity = $self->{'verbosity'};
[1839]597 my $out = $self->{'outhandle'};
[1643]598
[1829]599 my $collectiondir = $self->{'collectiondir'};
600 my $phinddir = $self->{'phinddir'};
601
[1883]602 my $language_exp = $self->{'language_exp'};
603
[1643]604 my ($w, $l, $line, $word);
[1829]605
[1643]606 my ($first_delimiter, $last_delimiter,
607 $first_stopword, $last_stopword,
608 $first_extractword, $last_extractword,
609 $first_contentword, $last_contentword,
610 $phrasedelimiter);
[1829]611
612 my $thesaurus = $self->{'thesaurus'};
613 my ($thesaurus_links, $thesaurus_terms,
614 %thesaurus, $first_thesaurusword, $last_thesaurusword);
[1643]615
616 my %symbol;
617 my (%freq);
618
[1829]619 print $out "Calculating vocabulary\n" if ($verbosity > 1);
[1643]620
621 # Read and store the stopwords
[2576]622 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
[1883]623 my $stopword_files = ();
624 my ($language, $language_dir, $file, $file_name);
[1643]625 my %stopwords;
[1883]626
627 # Examine each directory in the stopword directory
628 opendir(STOPDIR, $stopdir);
629 foreach $language (readdir STOPDIR) {
630
631 # Ignore entries that do not match the classifier's language
632 next unless ($language =~ /$language_exp/);
633 $language_dir = &util::filename_cat($stopdir, $language);
634 next unless (-d "$language_dir");
635
636 opendir(LANGDIR, $language_dir);
637 foreach $file (readdir LANGDIR) {
638
639 # Ignore entries that are not stopword files
640 next unless ($file =~ /sw$/);
641 $file_name = &util::filename_cat($language_dir, $file);
642 next unless (-f "$file_name");
643
644 # Read the stopwords
645 open(STOPFILE, "<$file_name");
646 while (<STOPFILE>) {
647 s/^\s+//;
648 s/\s.*//;
649 $word = $_;
650 $l = lc($word);
651 $stopwords{$l} = $word;
652 }
653 close STOPFILE;
654
655 }
[2333]656 closedir LANGDIR;
[1643]657 }
[2333]658 closedir STOPDIR;
[1883]659
[1829]660 # Read thesaurus information
661 if ($thesaurus) {
662
[1897]663 # link file exists
[1829]664 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
665 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
[1897]666
667 # ensure term file exists in the correct language
668 if ($language_exp =~ /^([a-z][a-z])/) {
669 $language = $1;
670 } else {
671 $language = 'en';
672 }
673 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
[1829]674 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
[1897]675
[1829]676
677 # Read the thesaurus terms
678 open(TH, "<$thesaurus_terms");
[1643]679 while(<TH>) {
680 s/^\d+ //;
681 s/\(.*\)//;
[1829]682 foreach $w (split(/\s+/, $_)) {
[1643]683 $thesaurus{lc($w)} = $w;
684 }
685 }
686 close TH;
687 }
688
689 # Read words in the text and count occurrences
[1829]690 open(TXT, "<$phinddir/clauses");
[1643]691 my @words;
692
693 while(<TXT>) {
694 $line = $_;
695 next unless ($line =~ /./);
696
697 @words = split(/\s+/, $line);
698 foreach $w (@words) {
699 $l = lc($w);
700 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
701 $freq{$w}++;
702 }
703 $freq{$senlimit}++;
704 }
705
[2333]706 close TXT;
707
[1643]708 # Calculate the "best" form of each word
709 my (%bestform, %totalfreq, %bestfreq);
710
711 foreach $w (sort (keys %freq)) {
712 $l = lc($w);
713
714 # totalfreq is the number of times a term appears in any form
715 $totalfreq{$l} += $freq{$w};
716
717 if (defined $stopwords{$l}) {
718 $bestform{$l} = $stopwords{$l};
719
720 } elsif (defined $thesaurus{$l}) {
721 $bestform{$l} = $thesaurus{$l};
722
723 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
724 $bestfreq{$l} = $freq{$w};
725 $bestform{$l} = $w;
726 }
727 }
728
729 undef %freq;
730 undef %bestfreq;
731
732
733 # Assign symbol numbers to tokens
734 my $nextsymbol = 1;
735 my (@vocab);
736
737 # Delimiters
738 $first_delimiter = 1;
739
740 foreach $word (@delimiters) {
741
742 $word = lc($word);
743 $bestform{$word} = uc($word);
744 $vocab[$nextsymbol] = $word;
745 $symbol{$word} = $nextsymbol;
746 $nextsymbol++;
747 }
748 $last_delimiter = $nextsymbol - 1;
749
750 # Stopwords
751 $first_stopword = $nextsymbol;
752
753 foreach my $word (sort keys %stopwords) {
754
755 # don't include a stopword unless it occurs in the text
756 $word = lc($word);
757 next unless ($totalfreq{$word});
758 next if ($symbol{$word});
759
760 $vocab[$nextsymbol] = $word;
761 $symbol{$word} = $nextsymbol;
762 $nextsymbol++;
763 }
764 $last_stopword = $nextsymbol - 1;
765 $first_contentword = $nextsymbol;
766
767 # Thesaurus terms
[1829]768 if ($thesaurus) {
[1643]769 $first_thesaurusword = $nextsymbol;
770
771 foreach my $word (sort keys %thesaurus) {
772
773 $word = lc($word);
774 next if ($symbol{$word});
775 $bestform{$word} = $thesaurus{$word};
776
777 $vocab[$nextsymbol] = $word;
778 $symbol{$word} = $nextsymbol;
779 $nextsymbol++;
780
781 }
782 $last_thesaurusword = $nextsymbol - 1;
783 }
784
785 # Other content words
786 $first_extractword = $nextsymbol;
787
788 foreach my $word (sort (keys %bestform)) {
789
790 next if ($symbol{$word});
791
792 $vocab[$nextsymbol] = $word;
793 $symbol{$word} = $nextsymbol;
794 $nextsymbol++;
795 }
796 $last_extractword = $nextsymbol - 1;
797 $last_contentword = $nextsymbol - 1;
798
799
800 # Output the words
[1829]801 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
802 open(VOC, ">$phinddir/clauses.vocab");
[1643]803
804 for (my $i = 1; $i < $nextsymbol; $i++) {
805 $w = $vocab[$i];
806
807 print VOC "$bestform{$w}\n";
808 $totalfreq{$w} = 0 unless ($totalfreq{$w});
809 }
810 close VOC;
811
812
[1829]813 # Create statistics file
[1643]814 # Output statistics about the vocabulary
[1829]815 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
816 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
[1643]817
[1829]818 open(STAT, ">$phinddir/clauses.stats")
819 || die "Cannot open $phinddir/clauses.stats: $!";
820
[1643]821 print STAT "first_delimiter $first_delimiter\n";
822 print STAT "last_delimiter $last_delimiter\n";
823 print STAT "first_stopword $first_stopword\n";
824 print STAT "last_stopword $last_stopword\n";
[1829]825 if ($thesaurus) {
[1643]826 print STAT "first_thesaurusword $first_thesaurusword\n";
827 print STAT "last_thesaurusword $last_thesaurusword\n";
828 }
829 print STAT "first_extractword $first_extractword\n";
830 print STAT "last_extractword $last_extractword\n";
831 print STAT "first_contentword $first_contentword\n";
832 print STAT "last_contentword $last_contentword\n";
833 print STAT "first_symbol $first_delimiter\n";
834 print STAT "last_symbol $last_contentword\n";
835 print STAT "first_word $first_stopword\n";
836 print STAT "last_word $last_contentword\n";
837 close STAT;
838
839 undef @vocab;
840
841
[1829]842 # Create numbers file
[1643]843 # Save text as symbol numbers
[1829]844 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
[1643]845
[1829]846 open(TXT, "<$phinddir/clauses");
847 open(NUM, ">$phinddir/clauses.numbers");
[1643]848
849 $phrasedelimiter = $symbol{lc($senlimit)};
850 print NUM "$symbol{lc($colstart)}\n";
851
852 # set up the special symbols that delimit documents and sentences
853 while(<TXT>) {
854
855 # split sentence into a list of tokens
856 $line = $_;
857 next unless ($line =~ /./);
858 @words = split(/\s+/, $line);
859
860 # output one token at a time
861 foreach $word (@words) {
862 $word = lc($word);
863 print NUM "$symbol{$word}\n";
864 }
865
866 # output phrase delimiter
867 print NUM "$phrasedelimiter\n";
868 }
869
[2333]870 close TXT;
[1643]871 print NUM "$symbol{lc($colend)}\n";
[1829]872 close NUM;
[1643]873
[1829]874 # Save thesaurus data in one convenient file
875 if ($thesaurus) {
876
877 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
878
879
880 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
881 if ($verbosity > 1);
882
883 # Read the thesaurus terms
884 my ($num, $text, %thes_symbols);
885
886 open(TH, "<$thesaurus_terms");
887 while(<TH>) {
888 chomp;
889 @words = split(/\s+/, $_);
890 $num = shift @words;
891 $text = "";
892
893 # translate words into symbol numbers
894 foreach $word (@words) {
895 $word = lc($word);
896 if ($symbol{$word}) {
897 $text .= "s$symbol{$word} ";
898 } elsif ($verbosity) {
899 print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
900 }
901 }
902 $text =~ s/ $//;
903 $thes_symbols{$num} = $text;
904 }
905 close TH;
906
907 # Read the thesaurus links and write the corresponding data
908 open(TH, "<$thesaurus_links");
909 open(THOUT, ">$thesaurusfile");
910
911 while(<TH>) {
912 chomp;
913 ($num, $text) = split(/:/, $_);
914
915 if (defined($thes_symbols{$num})) {
916 print THOUT "$num:$thes_symbols{$num}:$text\n";
917 } else {
918 print THOUT "$num:untranslated:$text\n";
919 }
920 }
921 close TH;
922 close THOUT;
923 }
924
925
926
927
[1643]928}
929
930
[1829]931# renumber_phrases
932#
933# Prepare the phrases file to be input to mgpp. The biggest problem is
934# reconciling the phrase identifiers used by the suffix program (which
935# we'll call suffix-id numbers) with the numbers used in the thesaurus
936# (thesaurus-id) to create a common set of phind-id numbers (phind-id).
937# Phind-id numbers must be sorted by frequency of occurrence.
938#
939# Start creating a set of phind-id numbers from the sorted suffix-id
940# numbers and (if required) the thesaurus-id numbers. Then add any other
941# phrases occurring in the thesaurus.
942#
943# The last thing we have to do is restore the vocabulary information to the
944# phrase file so that the phrases are stored as words, not as symbol
945# numbers.
[1643]946
[1829]947# The original phrases file looks something like this:
948# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
949# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
950# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
951# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
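#
# After renumber_suffix_data, each line of phrases.2 instead has the form
# (taken from the print statements below; the trailing comma and semicolon
# are part of the format):
# phind-id:symbols:frequency:expansion-count:document-count:exp,exp,...,:doc;doc;...;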
[1643]952
953
954sub renumber_phrases {
[1829]955 my ($self) = @_;
[1643]956
[1829]957 renumber_suffix_data($self);
958 renumber_thesaurus_data($self);
959 restore_vocabulary_data($self);
[1808]960
[1829]961}
[1808]962
963
[1643]964
[1829]965# renumber_suffix_data
966#
967# Translate phrases file to phrases.2 using phind keys instead
968# of suffix keys and sorting the expansion data.
969
970sub renumber_suffix_data {
971 my ($self) = @_;
[1643]972
[1829]973 my $verbosity = $self->{'verbosity'};
[1839]974 my $out = $self->{'outhandle'};
[1829]975 print $out "Translate phrases: suffix-ids become phind-ids\n"
976 if ($verbosity);
977
978 my $phinddir = $self->{'phinddir'};
979 my $infile = &util::filename_cat($phinddir, 'phrases');
980 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
[1643]981
[1829]982 # Read the phrase file. Calculate initial set of phind-id
983 # numbers and store (suffixid -> frequency) relation.
[1643]984
[1829]985 my %suffixtophind;
986 my @totalfrequency;
987 my (@fields, $suffixid);
988 my $nextphind = 1;
[1643]989
[1829]990 open(IN, "<$infile");
[1643]991 while(<IN>) {
[1829]992
[1643]993 chomp;
[1829]994 @fields = split(/:/, $_);
[1643]995
[1829]996 # get next suffixid and phindid
997 $suffixid = shift @fields;
998 $suffixtophind{$suffixid} = $nextphind;
999
1000 # store total frequency
1001 shift @fields;
1002 $totalfrequency[$nextphind] = shift @fields;
1003
1004 $nextphind++;
[1643]1005 }
[1829]1006 close IN;
[1808]1007
[1829]1008
1009 # Translate phrases file to phrases.2. Use phind keys (not suffix
1010 # keys), sort expansion and document occurrence data in order of
1011 # descending frequency.
1012 open(IN, "<$infile");
1013 open(OUT, ">$outfile");
[1643]1014
[1829]1015 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1016 my (@documents, @newexp, $k, $n);
[1643]1017 my $linenumber = 0;
1018
1019 while(<IN>) {
1020
1021 # read the line
1022 chomp;
[1829]1023 @fields = split(/:/, $_);
[1643]1024
1025 # get a phrase number for this line
[1829]1026 $suffixid = shift @fields;
1027 die unless (defined($suffixtophind{$suffixid}));
1028 $phindid = $suffixtophind{$suffixid};
[1643]1029
[1829]1030 # get the symbols in the phrase
[1643]1031 $text = shift @fields;
1032
[1829]1033 # output status information
[1643]1034 $linenumber++;
[1829]1035 if ($verbosity > 2) {
1036 if ($linenumber % 1000 == 0) {
1037 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1038 }
1039 print $out "$linenumber: $phindid\t($text)\n" if ($verbosity > 3);
[1643]1040 }
[1829]1041
[1643]1042 # get the phrase frequency
1043 $tf = shift @fields;
1044
1045 # get the number of expansions
1046 $countexp = shift @fields;
1047
[1829]1048 # get the expansions, convert them into phind-id numbers, and sort them
[1643]1049 $expansions = shift @fields;
1050 @newexp = ();
1051 foreach $k (split(/,/, $expansions)) {
[1829]1052 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1053 $n = $suffixtophind{$k};
[1643]1054 push @newexp, $n;
1055 }
[1829]1056 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
[1643]1057
1058 # get the number of documents
1059 $countdocs = shift @fields;
1060
[1829]1061 # get the documents and sort them
[1643]1062 $documents = shift @fields;
1063 $documents =~ s/d//g;
1064 @documents = split(/;/, $documents);
[1829]1065 @documents = sort by_doc_frequency @documents;
[1643]1066
1067 # output the phrase data
[1829]1068 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1069 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1070
1071 }
1072
1073 close IN;
1074 close OUT;
1075}
1076
1077
1078# renumber_thesaurus_data
1079#
1080# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1081
1082sub renumber_thesaurus_data {
1083 my ($self) = @_;
1084
[1839]1085 my $out = $self->{'outhandle'};
[1829]1086 my $verbosity = $self->{'verbosity'};
1087 my $thesaurus = $self->{'thesaurus'};
1088
1089 my $phinddir = $self->{'phinddir'};
1090 my $infile = &util::filename_cat($phinddir, "phrases.2");
1091 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1092
1093
1094 # If no thesaurus is defined, simply move the phrases file.
1095 if (!$thesaurus) {
1096 print $out "Translate phrases.2: no thesaurus data\n"
1097 if ($verbosity);
1098 &util::mv($infile, $outfile);
1099 return;
1100 }
1101
1102 print $out "Translate phrases.2: add thesaurus data\n"
1103 if ($verbosity);
1104
1105 # 1.
1106 # Read thesaurus file and store (symbols->thesaurusid) mapping
1107 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1108 my %symbolstothesid;
1109 my (@fields, $thesid, $symbols);
1110
1111 open(TH, "<$thesaurusfile");
1112
1113 while (<TH>) {
1114
1115 chomp;
1116 @fields = split(/:/, $_);
1117
1118 # get id and text
1119 $thesid = shift @fields;
1120 $symbols = shift @fields;
1121 $symbolstothesid{$symbols} = $thesid;
1122 }
1123 close TH;
1124
1125 # 2.
[1839]1126 # Read phrases file to find thesaurus entries that already
[2064]1127 # have a phindid. Store their phind-ids for later translation,
1128 # and store their frequency for later sorting.
[1829]1129 my %thesaurustophindid;
[2064]1130 my %phindidtofrequency;
1131 my ($phindid, $freq);
[1829]1132
1133 open(IN, "<$infile");
1134
1135 while(<IN>) {
1136
1137 chomp;
1138 @fields = split(/:/, $_);
1139
1140 # phindid and symbols for this line
1141 $phindid = shift @fields;
1142 $symbols = shift @fields;
[2064]1143 $freq = shift @fields;
[1829]1144
1145 # do we have a thesaurus id corresponding to this phrase?
1146 if (defined($symbolstothesid{$symbols})) {
1147 $thesid = $symbolstothesid{$symbols};
1148 $thesaurustophindid{$thesid} = $phindid;
[2064]1149 $phindidtofrequency{$phindid} = $freq;
[1829]1150 }
1151 }
1152 close IN;
1153
1154 undef %symbolstothesid;
1155
1156 # 3.
[2064]1157 # Create phind-id numbers for remaining thesaurus entries,
1158 # and note that their frequency is 0 for later sorting.
[1829]1159 my $nextphindid = $phindid + 1;
1160
1161 open(TH, "<$thesaurusfile");
1162 while(<TH>) {
1163
1164 chomp;
1165 @fields = split(/:/, $_);
1166
1167 # read thesaurus-id and ensure it has a corresponding phind-id
1168 $thesid = shift @fields;
1169 if (!defined($thesaurustophindid{$thesid})) {
1170 $thesaurustophindid{$thesid} = $nextphindid;
[2064]1171 $phindidtofrequency{$nextphindid} = 0;
[1829]1172 $nextphindid++;
1173 }
1174 }
1175 close TH;
1176
1177 # 4.
1178 # Translate thesaurus file, replacing thesaurus-id numbers with
1179 # phind-id numbers.
1180 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
[2064]1181 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1182 my (@links, $linkid, %linkidtotype, $newrelation);
[1829]1183
1184 open(TH, "<$thesaurusfile");
1185 open(TO, ">$newthesaurusfile");
1186 while(<TH>) {
1187
1188 chomp;
1189 @fields = split(/:/, $_);
1190
1191 # phindid and symbols for this line
1192 ($thesid, $symbols, $relations) = @fields;
1193
1194 die unless ($thesid && $symbols);
1195 die unless $thesaurustophindid{$thesid};
1196 $phindid = $thesaurustophindid{$thesid};
1197
1198 # convert each part of the relation string to use phind-id numbers
[2064]1199 # at the same time, we want to sort the list by frequency.
1200 undef %linkidtotype;
1201
[1829]1202 foreach $linktext (split(/;/, $relations)) {
1203 @linkdata = split(/,/, $linktext);
[2064]1204
[1829]1205 # remember the linktype (e.g. BT, NT)
1206 $linktype = shift @linkdata;
[2064]1207
1208 # store the type of each link
1209 foreach $thesid (@linkdata) {
1210 die unless (defined($thesaurustophindid{$thesid}));
1211 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1212 }
1213 }
[1829]1214
[2064]1215 # sort the list of links, first by frequency, then by type.
1216 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1217 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1218 $linkcounter = (scalar @links);
1219
1220 # create a string describing the link information
1221 $linktype = $linkidtotype{$links[0]};
1222 $newrelation = $linktype;
1223 foreach $linkid (@links) {
1224 if ($linkidtotype{$linkid} ne $linktype) {
1225 $linktype = $linkidtotype{$linkid};
1226 $newrelation .= ";" . $linktype;
[1829]1227 }
[2064]1228 $newrelation .= "," . $linkid;
[1829]1229 }
[2064]1230 $newrelation .= ";";
1231
[1829]1232
[2064]1233 # output the new line
1234 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
[1829]1235 }
1236 close TH;
1237 close TO;
1238
1239 undef %thesaurustophindid;
[2064]1240 undef %linkidtotype;
1241 undef %phindidtofrequency;
[1829]1242
1243 # 5.
1244 # Read thesaurus data (in phind-id format) into memory
1245 my %thesaurusdata;
1246
1247 open(TH, "<$newthesaurusfile");
1248 while(<TH>) {
1249 chomp;
1250 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1251 die unless ($phindid && $symbols);
1252 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1253 }
[2487]1254 close TH;
[1829]1255
1256 # 6.
1257 # Add thesaurus data to phrases file
1258 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1259 my (@documents, @newexp, $k, $n);
1260 my $linenumber = 0;
1261
1262 open(IN, "<$infile");
1263 open(OUT, ">$outfile");
1264
1265 # Update existing phrases
1266 while(<IN>) {
1267
1268 chomp;
1269 @fields = split(/:/, $_);
1270
1271 # get data for this line
1272 $phindid = shift @fields;
1273
1274 # output the phrase data, with thesaurus information
1275 print OUT "$phindid:", join(":", @fields);
1276
1277 # add thesaurus data
1278 if (defined($thesaurusdata{$phindid})) {
1279 @fields = split(/:/, $thesaurusdata{$phindid});
1280 shift @fields;
1281 $linkcounter = shift @fields;
1282 $relations = shift @fields;
1283
1284 print OUT ":$linkcounter:$relations";
1285 $thesaurusdata{$phindid} = "";
1286 }
1287 print OUT "\n";
1288 }
1289 close IN;
1290
1291 # Add phrases that aren't already in the file
1292 foreach $phindid (sort numerically keys %thesaurusdata) {
1293 next unless ($thesaurusdata{$phindid});
1294
1295 @fields = split(/:/, $thesaurusdata{$phindid});
1296 $symbols = shift @fields;
1297 $linkcounter = shift @fields;
1298 $relations = shift @fields;
1299
1300 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1301 }
1302 close OUT;
1303
1304}
1305
1306# restore_vocabulary_data
1307#
1308# Read phrases.3 and restore vocabulary information. Then write
1309# this data to the MGPP input files (pword.txt and pdata.txt) and
1310# (if requested) to the saved phrases file.
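#
# For reference (taken from the print statements below): each phrase becomes a
# "<Document>key:text:..." record in pdata.txt and a "<Document>word" line in
# pword.txt (the word is left empty for multi-word phrases); if -savephrases
# was given, a tab-separated "frequency, document-count, is-thesaurus, text"
# line is also written to the saved phrases file.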
1311
1312sub restore_vocabulary_data {
1313 my ($self) = @_;
1314
[1839]1315 my $out = $self->{'outhandle'};
[1829]1316 my $verbosity = $self->{'verbosity'};
1317 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1318
1319 my $phinddir = $self->{'phinddir'};
1320 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1321 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1322 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1323 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1324
1325 my $savephrases = $self->{'savephrases'};
1326
1327 # 1.
1328 # Read the vocabulary file
1329 open(V, "<$vocabfile")
1330 || die "Cannot open $vocabfile: $!";
1331 my @symbol;
1332 my $i = 1;
1333 while(<V>) {
1334 chomp;
1335 $symbol[$i++] = $_;
1336 }
[2487]1337 close V;
[1829]1338
1339 # 2.
1340 # Translate phrases.3 to MGPP input files
[2079]1341 my ($key, $text, $word, $isThesaurus);
[1829]1342 my @fields;
1343 my $linenumber = 0;
1344
1345 open(IN, "<$infile");
1346 open(DATA, ">$datafile");
1347 open(WORD, ">$wordfile");
1348
1349 # Save the phrases in a separate text file
1350 if ($savephrases) {
1351 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1352 open(SAVE, ">$savephrases");
1353 }
1354
1355 while(<IN>) {
1356
1357 # read the line
1358 chomp;
1359 $line = $_;
1360 @fields = split(/:/, $line);
1361
1362 # get a phrase number for this line
1363 $key = shift @fields;
1364
1365 # restore the text of the phrase
1366 $text = shift @fields;
1367 $text =~ s/s(\d+)/$symbol[$1]/g;
1368 if ($text =~ / /) {
1369 $word = "";
1370 } elsif ($text ne 'untranslated') {
1371 $word = $text;
1372 }
1373
1374 # output the phrase data
[1643]1375 print DATA "<Document>";
[1829]1376 print DATA "$key:$text:", join(":", @fields), ":\n";
[1643]1377
1378 # output the word index search data
[1829]1379 print WORD "<Document>$word\n";
[1643]1380
[1808]1381 # output the phrases to a text file
1382 if ($savephrases) {
[2079]1383 if ((scalar @fields) == 7) {
1384 $isThesaurus = 1;
1385 } else {
1386 $isThesaurus = 0;
1387 }
1388 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
[1808]1389 }
[1643]1390 }
[1829]1391 close IN;
1392 close WORD;
1393 close DATA;
1394 close SAVE if ($savephrases);
[1808]1395
[1643]1396}
1397
[1829]1398
1399
[1643]1400# sort routines used to renumber phrases
1401
1402sub numerically { $a <=> $b }
1403
[1829]1404sub by_doc_frequency {
[1643]1405 my $fa = 1;
1406 if ($a =~ /,/) {
1407 $fa = $a;
1408 $fa =~ s/\d+,//;
1409 }
1410 my $fb = 1;
1411 if ($b =~ /,/) {
1412 $fb = $b;
1413 $fb =~ s/\d+,//;
1414 }
1415
1416 return ($fb <=> $fa);
1417}
1418
14191;