Context Navigation

source: gsdl/trunk/perllib/classify/Phind.pm@ 20454

Last change on this file since 20454 was 20454, checked in by kjdon, 15 years ago
strip ex from metadata type args
Property svn:keywords set to `Author Date Id Revision`
File size: 40.5 KB

Line
1	###########################################################################
2	#
3	# Phind.pm -- the Phind classifier
4	#
5	# Copyright (C) 2000 Gordon W. Paynter
6	# Copyright (C) 2000 New Zealand Digital Library Project
7	#
8	#
9	# A component of the Greenstone digital library software
10	# from the New Zealand Digital Library Project at the
11	# University of Waikato, New Zealand.
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# The Phind clasifier plugin.
30	# Type "classinfo.pl Phind" at the command line for a summary.
31
32	# 12/05/02 Added usage datastructure - John Thompson
33
34	package Phind;
35
36	use BaseClassifier;
37	use util;
38	use ghtml;
39	use unicode;
40
41	use strict;
42	no strict 'refs'; # allow filehandles to be variables and viceversa
43
44	my @removedirs = ();
45
46	my %wanted_index_files = ('td'=>1,
47	't'=>1,
48	'ti'=>1,
49	'tl'=>1,
50	'tsd'=>1,
51	'idb'=>1,
52	'ib1'=>1,
53	'ib2'=>1,
54	'ib3'=>1,
55	'i'=>1,
56	'il'=>1,
57	'w'=>1,
58	'wa'=>1);
59
60	sub BEGIN {
61	@Phind::ISA = ('BaseClassifier');
62	}
63
64	sub END {
65
66	# Tidy up stray files - we do this here as there's some weird problem
67	# preventing us from doing it in the get_classify_info() function (on
68	# windows at least) where the close() appears to fail on txthandle and
69	# dochandle, thus preventing us from deleting those files
70
71	foreach my $dir (@removedirs) {
72	if (-d $dir && opendir (DIR, $dir)) {
73	my @files = readdir DIR;
74	closedir DIR;
75
76	foreach my $file (@files) {
77	next if $file =~ /^\.\.?$/;
78	my ($suffix) = $file =~ /\.([^\.]+)$/;
79	if (!defined $suffix \|\| !defined $wanted_index_files{$suffix}) {
80	# delete it!
81	&util::rm (&util::filename_cat ($dir, $file));
82	}
83	}
84	}
85	}
86	}
87
88	my $arguments =
89	[ { 'name' => "text",
90	'desc' => "{Phind.text}",
91	'type' => "string",
92	'deft' => "section:Title,section:text",
93	'reqd' => "no" },
94	{ 'name' => "title",
95	'desc' => "{Phind.title}",
96	'type' => "metadata",
97	'deft' => "Title",
98	'reqd' => "no" },
99	{ 'name' => "buttonname",
100	'desc' => "{BasClas.buttonname}",
101	'type' => "string",
102	'deft' => "Phrase",
103	'reqd' => "no" },
104	{ 'name' => "language",
105	'desc' => "{Phind.language}",
106	'type' => "string",
107	'deft' => "en",
108	'reqd' => "no" },
109	{ 'name' => "savephrases",
110	'desc' => "{Phind.savephrases}",
111	'type' => "string",
112	'deft' => "",
113	'reqd' => "no" },
114	{ 'name' => "suffixmode",
115	'desc' => "{Phind.suffixmode}",
116	'type' => "int",
117	'deft' => "1",
118	'range' => "0,1",
119	'reqd' => "no" },
120	{ 'name' => "min_occurs",
121	'desc' => "{Phind.min_occurs}",
122	'type' => "int",
123	'deft' => "2",
124	'range' => "1,",
125	'reqd' => "no" },
126	{ 'name' => "thesaurus",
127	'desc' => "{Phind.thesaurus}",
128	'type' => "string",
129	'deft' => "",
130	'reqd' => "no" },
131	{ 'name' => "untidy",
132	'desc' => "{Phind.untidy}",
133	'type' => "flag",
134	'reqd' => "no" } ];
135
136	my $options = { 'name' => "Phind",
137	'desc' => "{Phind.desc}",
138	'abstract' => "no",
139	'inherits' => "yes",
140	'args' => $arguments };
141
142
143	# Phrase delimiter symbols - these should be abstracted out someplace
144
145	my $colstart = "COLLECTIONSTART";
146	my $colend = "COLLECTIONEND";
147	my $doclimit = "DOCUMENTLIMIT";
148	my $senlimit = "SENTENCELIMIT";
149	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
150
151
152	# Create a new Phind browser based on collect.cfg
153
154	sub new {
155	my ($class) = shift (@_);
156	my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
157	push(@$classifierslist, $class);
158
159	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
160	push(@{$hashArgOptLists->{"OptList"}},$options);
161
162	my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
163
164	if ($self->{'info_only'}) {
165	# don't worry about any options etc
166	return bless $self, $class;
167	}
168
169	# Ensure the Phind generate scripts are in place
170	my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
171	$file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
172	my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
173	if (!(-e $file1)) {
174	print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
175	exit(1);
176	}
177
178	# things that may have ex. in them that need to be stripped off
179	$self->{'text'} = $self->strip_ex_from_metadata($self->{'text'});
180	$self->{'title'} = $self->strip_ex_from_metadata($self->{'title'});
181
182	# Transfer value from Auto Parsing to the variable name that used in previous GreenStone.
183
184	$self->{"indexes"} = $self->{"text"};
185
186	# Further setup
187	$self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
188	$self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
189	if (! defined $self->{'builddir'}) {
190	$self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
191	}
192	$self->{'total'} = 0;
193
194	# Clean out the unused keys
195	delete $self->{"text"};
196
197	return bless $self, $class;
198	}
199
200
201	# Initialise the Phind classifier
202
203	sub init {
204	my $self = shift (@_);
205
206	# ensure we have a build directory
207	my $builddir = $self->{'builddir'};
208	die unless (-e "$builddir");
209
210	# create Phind directory
211	my $phnumber = 1;
212	my $phinddir = &util::filename_cat($builddir, "phind1");
213	while (-e "$phinddir") {
214	$phnumber++;
215	$phinddir = &util::filename_cat($builddir, "phind$phnumber");
216	}
217	&util::mk_dir("$phinddir");
218	$self->{'phinddir'} = $phinddir;
219	$self->{'phindnumber'} = $phnumber;
220
221	push(@removedirs, $phinddir) unless $self->{'untidy'};
222
223	# open filehandles for documents and text
224	my $clausefile = &util::filename_cat("$phinddir", "clauses");
225	&util::rm($clausefile) if (-e $clausefile);
226
227	my $txthandle = 'TEXT' . $phnumber;
228	open($txthandle, ">$clausefile") \|\| die "Cannot open $clausefile: $!";
229	$self->{'txthandle'} = $txthandle;
230
231	my $docfile = &util::filename_cat("$phinddir", "docs.txt");
232	&util::rm($docfile) if (-e $docfile);
233
234	my $dochandle = 'DOC' . $phnumber;
235	open($dochandle, ">$docfile") \|\| die "Cannot open $docfile: $!";
236	$self->{'dochandle'} = $dochandle;
237
238	}
239
240
241	# Classify each document.
242	#
243	# Each document is passed here in turn. The classifier extracts the
244	# text of each and stores it in the clauses file. Document details are
245	# stored in the docs.txt file.
246
247	sub classify {
248	my $self = shift (@_);
249	my ($doc_obj,$edit_mode) = @_;
250
251	my $verbosity = $self->{'verbosity'};
252	my $top_section = $doc_obj->get_top_section();
253
254	my $titlefield = $self->{'title'};
255
256	my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
257	if (!defined($title)) {
258	$title = "";
259	print STDERR "Phind: document has no title\n";
260	}
261	print "process: $title\n" if ($verbosity > 2);
262
263	# Only consider the file if it is in the correct language
264	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
265	my $phrlanguage = $self->{'language'};
266	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
267
268	if ($edit_mode eq "delete") {
269	# This classifier works quite differently to the others
270	# Probably doesn't support incremental building anyway
271	return;
272	}
273
274	# record this file
275	$self->{'total'} ++;
276	# what is $file ???
277	# print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
278
279	# Store document details
280	my $OID = $doc_obj->get_OID();
281	$OID = "NULL" unless defined $OID;
282	my $dochandle = $self->{'dochandle'};
283	print $dochandle "<Document>\t$OID\t$title\n";
284
285	# Store the text occuring in this object
286
287	# output the document delimiter
288	my $txthandle = $self->{'txthandle'};
289	print $txthandle "$doclimit\n";
290
291	# iterate over the required indexes and store their text
292	my $indexes = $self->{'indexes'};
293	my $text = "";
294	my ($part, $level, $field, $section, $data, $dataref);
295
296	foreach $part (split(/,/, $indexes)) {
297
298	# Each field has a level and a data element ((e.g. document:Title)
299	($level, $field) = split(/:/, $part);
300	die unless ($level && $field);
301
302	# Extract the text from every section
303	# (In phind, document:text and section:text are equivalent)
304	if ($field eq "text") {
305	$data = "";
306	$section = $doc_obj->get_top_section();
307	while (defined($section)) {
308	$data .= $doc_obj->get_text($section) . "\n";
309	$section = $doc_obj->get_next_section($section);
310	}
311	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
312	}
313
314	# Extract a metadata field from a document
315	# (If there is more than one element of the given type, get them all.)
316	elsif ($level eq "document") {
317	$dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
318	foreach $data (@$dataref) {
319	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
320	}
321	}
322
323	# Extract metadata from every section in a document
324	elsif ($level eq "section") {
325	$data = "";
326	$section = $doc_obj->get_top_section();
327	while (defined($section)) {
328	$dataref = $doc_obj->get_metadata($section, $field);
329	$data .= join("\n", @$dataref) . "\n";
330	$section = $doc_obj->get_next_section($section);
331	}
332	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
333	}
334
335	# Some sort of specification which I don't understand
336	else {
337	die "Unknown level ($level) in Phind index ($part)\n";
338	}
339
340	}
341
342	# output the text
343	$text =~ tr/\n//s;
344	print $txthandle "$text";
345	}
346
347
348	# Construct the classifier from the information already gathered
349	#
350	# When get_classify_info is called, the clauses and docs.txt files have
351	# already been constructed in the Phind directory. This function will
352	# translate them into compressed, indexed MGPP files that can be read by
353	# the phindcgi script. It will also register our classifier so that it
354	# shows up in the navigation bar.
355
356	sub get_classify_info {
357	my $self = shift (@_);
358	my ($gli) = @_;
359
360	close $self->{'dochandle'};
361	close $self->{'txthandle'};
362	my $verbosity = $self->{'verbosity'};
363	my $out = $self->{'outhandle'};
364	my $phinddir = $self->{'phinddir'};
365
366	my $osextra = "";
367	if ($ENV{'GSDLOS'} !~ /^windows$/i) {
368	$osextra = " -d /";
369	}
370
371	print STDERR "</Stage>\n" if $gli;
372
373	if ($verbosity) {
374	print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
375	print $out "*** in ", $self->{'phinddir'}, "\n";
376	}
377
378	print STDERR "<Stage name='Phind'>\n" if $gli;
379
380	# Construct phind indexes
381	my $suffixmode = $self->{'suffixmode'};
382	my $min_occurs = $self->{'min_occurs'};
383	my ($command, $status);
384
385	# Generate the vocabulary, symbol statistics, and numbers file
386	# from the clauses file
387	print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
388	print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
389	&extract_vocabulary($self);
390
391	# Use the suffix program to generate the phind/phrases file
392	print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
393	print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
394	&execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
395
396	# check that we generated some files. It's not necessarily an error if
397	# we didn't (execute() would have quit on error), but we can't go on.
398	my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
399	if (! -r $phrasesfile) {
400	print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
401	print $out "\nNo phrases found for Phind classifier!\n";
402	return;
403	}
404
405	# Create the phrase file and put phrase numbers in phind/phrases
406	print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
407	print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
408	&renumber_phrases($self);
409
410	print $out "\nCreating phrase databases\n";
411	print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
412	my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
413	my $mg_stem = &util::filename_cat($phinddir, "pdata");
414
415	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
416	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
417	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
418
419	# create the mg index of words
420	print $out "\nCreating word-level search indexes\n";
421	print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
422	$mg_input = &util::filename_cat($phinddir, "pword.txt");
423	$mg_stem = &util::filename_cat($phinddir, "pword");
424
425	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
426	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
427	&execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
428	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
429	&execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
430	&execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
431
432	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
433	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
434	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
435
436	# create the mg document information database
437	print $out "\nCreating document information databases\n";
438	print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
439	$mg_input = &util::filename_cat($phinddir, "docs.txt");
440	$mg_stem = &util::filename_cat($phinddir, "docs");
441
442	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
443	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
444	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
445
446	# Return the information about the classifier that we'll later want to
447	# use to create macros when the Phind classifier document is displayed.
448	my %classifyinfo = ('thistype'=>'Invisible',
449	'childtype'=>'Phind',
450	'Title'=>$self->{'buttonname'},
451	'parameters'=>"phindnumber=$self->{'phindnumber'}",
452	'contains'=>[]);
453
454	my $collection = $self->{'collection'};
455	my $url = "library?a=p&p=phind&c=$collection";
456	push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
457
458	return \%classifyinfo;
459	}
460
461
462
463	sub convert_gml_to_tokens {
464
465	my ($language_exp, $text) = @_;
466
467	# escape any magic words... - jrm21
468	foreach my $delim (@delimiters) {
469	my $replacement=lc($delim);
470	my $num= $text=~ s/$delim/$replacement/g;
471	if (!$num) {$num=0;}
472	}
473
474	if ($language_exp =~ /en/) {
475	return &convert_gml_to_tokens_EN($text);
476	}
477
478	if ($language_exp =~ /zh/) {
479	return &convert_gml_to_tokens_ZH($text);
480	}
481
482	$_ = $text;
483
484	# 1. remove GML tags
485
486	# Remove everything that is in a tag
487	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
488	s/\s<br>\s/ LINEBREAK /isgo;
489	s/<[^>]*>/ /sgo;
490
491	# Now we have the text, but it may contain HTML
492	# elements coded as > etc. Remove these tags.
493	s/&/&/sgo;
494	s/</</sgo;
495	s/>/>/sgo;
496	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
497	s/\s<br>\s/ LINEBREAK /isgo;
498	s/<[^>]*>/ /sgo;
499
500	# replace<p> and <br> placeholders with clause break symbol (\n)
501	s/\s+/ /gso;
502	s/PARAGRAPHBREAK/\n/sgo;
503	s/LINEBREAK/\n/sgo;
504
505
506
507
508	# 2. Split the remaining text into space-delimited tokens
509
510	# Convert any HTML special characters (like ") to their UTF8 equivalent
511	s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
512
513	# Split text at word boundaries
514	s/\b/ /go;
515
516	# 3. Convert the remaining text to "clause format"
517
518	# Insert newline if the end of a sentence is detected
519	# (delimter is: "[\.\?\!]\s")
520	# s/\s*[\.\?\!]\s+/\n/go;
521
522	# remove unnecessary punctuation and replace with clause break symbol (\n)
523	s/[^\w ]/\n/go;
524
525	# remove extraneous whitespace
526	s/ +/ /sgo;
527	s/^\s+//mgo;
528	s/\s*$/\n/mgo;
529
530	# remove lines that contain one word or less
531	s/^\S*$//mgo;
532	s/^\s*$//mgo;
533	tr/\n//s;
534
535	return $_;
536	}
537
538	# a chinese version
539	sub convert_gml_to_tokens_ZH {
540
541	$_ = shift @_;
542
543	# Replace all whitespace with a simple space
544	s/\s+/ /gs;
545	# Remove everything that is in a tag
546	s/\s<p>\s/ PARAGRAPHBREAK /isg;
547	s/\s<br>\s/ LINEBREAK /isg;
548	s/<[^>]*>/ /sg;
549
550	# Now we have the text, but it may contain HTML
551	# elements coded as > etc. Remove these tags.
552	s/</</sg;
553	s/>/>/sg;
554
555	s/\s+/ /sg;
556	s/\s<p>\s/ PARAGRAPHBREAK /isg;
557	s/\s<br>\s/ LINEBREAK /isg;
558	s/<[^>]*>/ /sg;
559
560	# remove & and other miscellaneous markup tags
561	s/&/&/sg;
562	s/</</sg;
563	s/>/>/sg;
564	s/&/&/sg;
565
566	# replace<p> and <br> placeholders with carriage returns
567	s/PARAGRAPHBREAK/\n/sg;
568	s/LINEBREAK/\n/sg;
569
570
571	# print STDERR "text:$_\n";
572	return $_;
573	}
574
575	# A version of convert_gml_to_tokens that is fine-tuned to the English language.
576
577	sub convert_gml_to_tokens_EN {
578	$_ = shift @_;
579
580	# FIRST, remove GML tags
581
582	# Replace all whitespace with a simple space
583	s/\s+/ /gs;
584
585	# Remove everything that is in a tag
586	s/\s<p>\s/ PARAGRAPHBREAK /isg;
587	s/\s<br>\s/ LINEBREAK /isg;
588	s/<[^>]*>/ /sg;
589
590	# Now we have the text, but it may contain HTML
591	# elements coded as > etc. Remove these tags.
592	s/</</sg;
593	s/>/>/sg;
594
595	s/\s+/ /sg;
596	s/\s<p>\s/ PARAGRAPHBREAK /isg;
597	s/\s<br>\s/ LINEBREAK /isg;
598	s/<[^>]*>/ /sg;
599
600	# remove & and other miscellaneous markup tags
601	s/&/&/sg;
602	s/</</sg;
603	s/>/>/sg;
604	s/&/&/sg;
605
606	# replace<p> and <br> placeholders with carriage returns
607	s/PARAGRAPHBREAK/\n/sg;
608	s/LINEBREAK/\n/sg;
609
610
611	# Exceptional punctuation
612	#
613	# We make special cases of some punctuation
614
615	# remove any apostrophe that indicates omitted letters
616	s/(\w+)\'(\w*\s)/ $1$2 /g;
617
618	# remove period that appears in a person's initals
619	s/\s([A-Z])\./ $1 /g;
620
621	# replace hyphens in hypheanted words and names with a space
622	s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
623
624	# Convert the remaining text to "clause format",
625	# This means removing all excess punctuation and garbage text,
626	# normalising valid punctuation to fullstops and commas,
627	# then putting one cluse on each line.
628
629	# Insert newline when the end of a sentence is detected
630	# (delimter is: "[\.\?\!]\s")
631	s/\s*[\.\?\!]\s+/\n/g;
632
633	# split numbers after four digits
634	s/(\d\d\d\d)/$1 /g;
635
636	# split words after 32 characters
637
638	# squash repeated punctuation
639	tr/A-Za-z0-9 //cs;
640
641	# save email addresses
642	# s/\w+@\w+\.[\w\.]+/EMAIL/g;
643
644	# normalise clause breaks (mostly punctuation symbols) to commas
645	s/[^A-Za-z0-9 \n]+/ , /g;
646
647	# Remove repeated commas, and replace with newline
648	s/\s*,[, ]+/\n/g;
649
650	# remove extra whitespace
651	s/ +/ /sg;
652	s/^\s+//mg;
653	s/\s*$/\n/mg;
654
655	# remove lines that contain one word or less
656	s/^\w*$//mg;
657	s/^\s*$//mg;
658	tr/\n//s;
659
660	return $_;
661
662	}
663
664
665
666	# Execute a system command
667
668	sub execute {
669	my ($command, $verbosity, $outhandle) = @_;
670	print $outhandle "Executing: $command\n" if ($verbosity > 2);
671	$! = 0;
672	my $status = system($command);
673	if ($status != 0) {
674	print STDERR "Phind - Error executing '$command': $!\n";
675	exit($status); # this causes the build to fail...
676	}
677	}
678
679
680	# Generate the vocabulary, symbol statistics, and numbers file from the
681	# clauses file. This is legacy code, so is a bit messy and probably wont
682	# run under windows.
683
684	sub extract_vocabulary {
685	my ($self) = @_;
686
687	my $verbosity = $self->{'verbosity'};
688	my $out = $self->{'outhandle'};
689
690	my $collectiondir = $self->{'collectiondir'};
691	my $phinddir = $self->{'phinddir'};
692
693	my $language_exp = $self->{'language'};
694
695	my ($w, $l, $line, $word);
696
697	my ($first_delimiter, $last_delimiter,
698	$first_stopword, $last_stopword,
699	$first_extractword, $last_extractword,
700	$first_contentword, $last_contentword,
701	$phrasedelimiter);
702
703	my $thesaurus = $self->{'thesaurus'};
704	my ($thesaurus_links, $thesaurus_terms,
705	%thesaurus, $first_thesaurusword, $last_thesaurusword);
706
707	my %symbol;
708	my (%freq);
709
710	print $out "Calculating vocabulary\n" if ($verbosity > 1);
711
712	# Read and store the stopwords
713	my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
714	my $stopword_files = ();
715	my ($language, $language_dir, $file, $file_name);
716	my %stopwords;
717
718	# Examine each directory in the stopword directory
719	opendir(STOPDIR, $stopdir);
720	foreach $language (readdir STOPDIR) {
721
722	# Ignore entries that do not match the classifier's language
723	next unless ($language =~ /$language_exp/);
724	$language_dir = &util::filename_cat($stopdir, $language);
725	next unless (-d "$language_dir");
726
727	opendir(LANGDIR, $language_dir);
728	foreach $file (readdir LANGDIR) {
729
730	# Ignore entries that are not stopword files
731	next unless ($file =~ /sw$/);
732	$file_name = &util::filename_cat($language_dir, $file);
733	next unless (-f "$file_name");
734
735	# Read the stopwords
736	open(STOPFILE, "<$file_name");
737	while (<STOPFILE>) {
738	s/^\s+//;
739	s/\s.*//;
740	$word = $_;
741	$l = lc($word);
742	$stopwords{$l} = $word;
743	}
744	close STOPFILE;
745
746	}
747	closedir LANGDIR;
748	}
749	closedir STOPDIR;
750
751	# Read thesaurus information
752	if ($thesaurus) {
753
754	# link file exists
755	$thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
756	die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
757
758	# ensure term file exists in the correct language
759	if ($language_exp =~ /^([a-z][a-z])/) {
760	$language = $1;
761	} else {
762	$language = 'en';
763	}
764	$thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
765	die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
766
767
768	# Read the thesaurus terms
769	open(TH, "<$thesaurus_terms");
770	while(<TH>) {
771	s/^\d+ //;
772	s/\(.*\)//;
773	foreach $w (split(/\s+/, $_)) {
774	$thesaurus{lc($w)} = $w;
775	}
776	}
777	close TH;
778	}
779
780	# Read words in the text and count occurences
781	open(TXT, "<$phinddir/clauses");
782
783	my @words;
784	while(<TXT>) {
785	$line = $_;
786	next unless ($line =~ /./);
787
788	@words = split(/\s+/, $line);
789	foreach $w (@words) {
790	$l = lc($w);
791	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
792	$freq{$w}++;
793	}
794	$freq{$senlimit}++;
795	}
796
797	close TXT;
798
799	# Calculate the "best" form of each word
800	my (%bestform, %totalfreq, %bestfreq);
801
802	foreach $w (sort (keys %freq)) {
803	$l = lc($w);
804
805	# totalfreq is the number of times a term appears in any form
806	$totalfreq{$l} += $freq{$w};
807
808	if (defined $stopwords{$l}) {
809	$bestform{$l} = $stopwords{$l};
810
811	} elsif (defined $thesaurus{$l}) {
812	$bestform{$l} = $thesaurus{$l};
813
814	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
815	$bestfreq{$l} = $freq{$w};
816	$bestform{$l} = $w;
817	}
818	}
819	undef %freq;
820	undef %bestfreq;
821
822
823	# Assign symbol numbers to tokens
824	my $nextsymbol = 1;
825	my (@vocab);
826
827	# Delimiters
828	$first_delimiter = 1;
829
830	foreach $word (@delimiters) {
831
832	# $word = lc($word); # jrm21
833	$word = uc($word);
834	$bestform{$word} = $word;
835	$vocab[$nextsymbol] = $word;
836	$symbol{$word} = $nextsymbol;
837	$nextsymbol++;
838	}
839	$last_delimiter = $nextsymbol - 1;
840	# Stopwords
841	$first_stopword = $nextsymbol;
842
843	foreach my $word (sort keys %stopwords) {
844	# don't include stopword unless it occurs in the text
845	$word = lc($word);
846	next unless ($totalfreq{$word});
847	next if ($symbol{$word});
848
849	$vocab[$nextsymbol] = $word;
850	$symbol{$word} = $nextsymbol;
851	$nextsymbol++;
852	}
853	$last_stopword = $nextsymbol - 1;
854	$first_contentword = $nextsymbol;
855
856	# Thesaurus terms
857	if ($thesaurus) {
858	$first_thesaurusword = $nextsymbol;
859
860	foreach my $word (sort keys %thesaurus) {
861
862	$word = lc($word);
863	next if ($symbol{$word});
864	$bestform{$word} = $thesaurus{$word};
865
866	$vocab[$nextsymbol] = $word;
867	$symbol{$word} = $nextsymbol;
868	$nextsymbol++;
869
870	}
871	$last_thesaurusword = $nextsymbol - 1;
872	}
873
874	# Other content words
875	$first_extractword = $nextsymbol;
876
877	foreach my $word (sort (keys %bestform)) {
878
879	next if ($symbol{$word});
880
881	$vocab[$nextsymbol] = $word;
882	$symbol{$word} = $nextsymbol;
883	$nextsymbol++;
884	}
885	$last_extractword = $nextsymbol - 1;
886	$last_contentword = $nextsymbol - 1;
887
888	# Outut the words
889	print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
890	open(VOC, ">$phinddir/clauses.vocab");
891
892	for (my $i = 1; $i < $nextsymbol; $i++) {
893	$w = $vocab[$i];
894
895	print VOC "$bestform{$w}\n";
896	$totalfreq{$w} = 0 unless ($totalfreq{$w});
897	}
898	close VOC;
899
900
901	# Create statistics file
902	# Output statistics about the vocablary
903	print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
904	&util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
905
906	open(STAT, ">$phinddir/clauses.stats")
907	\|\| die "Cannot open $phinddir/clauses.stats: $!";
908
909	print STAT "first_delimiter $first_delimiter\n";
910	print STAT "last_delimiter $last_delimiter\n";
911	print STAT "first_stopword $first_stopword\n";
912	print STAT "last_stopword $last_stopword\n";
913	if ($thesaurus) {
914	print STAT "first_thesaurusword $first_thesaurusword\n";
915	print STAT "last_thesaurusword $last_thesaurusword\n";
916	}
917	print STAT "first_extractword $first_extractword\n";
918	print STAT "last_extractword $last_extractword\n";
919	print STAT "first_contentword $first_contentword\n";
920	print STAT "last_contentword $last_contentword\n";
921	print STAT "first_symbol $first_delimiter\n";
922	print STAT "last_symbol $last_contentword\n";
923	print STAT "first_word $first_stopword\n";
924	print STAT "last_word $last_contentword\n";
925	close STAT;
926
927	undef @vocab;
928
929
930	# Create numbers file
931	# Save text as symbol numbers
932	print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
933
934	open(TXT, "<$phinddir/clauses");
935	open(NUM, ">$phinddir/clauses.numbers");
936
937	## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
938	## print NUM "$symbol{lc($colstart)}\n"; # jrm21
939	$phrasedelimiter = $symbol{$senlimit};
940	print NUM "$symbol{$colstart}\n";
941
942	# set up the special symbols that delimit documents and sentences
943	while(<TXT>) {
944
945	# split sentence into a list of tokens
946	$line = $_;
947	next unless ($line =~ /./);
948	@words = split(/\s+/, $line);
949
950	# output one token at a time
951	foreach $word (@words) {
952	# don't lower-case special delimiters - jrm21
953	if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
954	$word = lc($word);
955	}
956	print NUM "$symbol{$word}\n";
957	}
958
959	# output phrase delimiter
960	print NUM "$phrasedelimiter\n";
961	}
962
963	close TXT;
964	# print NUM "$symbol{lc($colend)}\n";# jrm21
965	print NUM "$symbol{$colend}\n";
966	close NUM;
967
968	# Save thesaurus data in one convienient file
969	if ($thesaurus) {
970
971	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
972
973
974	print $out "Saving thesaurus as numbers in $thesaurusfile\n"
975	if ($verbosity > 1);
976
977	# Read the thesaurus terms
978	my ($num, $text, %thes_symbols);
979
980	open(TH, "<$thesaurus_terms");
981	while(<TH>) {
982	chomp;
983	@words = split(/\s+/, $_);
984	$num = shift @words;
985	$text = "";
986
987	# translate words into symbol numbers
988	foreach $word (@words) {
989	$word = lc($word);
990	if ($symbol{$word}) {
991	$text .= "s$symbol{$word} ";
992	} elsif ($verbosity) {
993	print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
994	}
995	}
996	$text =~ s/ $//;
997	$thes_symbols{$num} = $text;
998	}
999	close TH;
1000
1001	# Read the thesaurus links and write the corresponding data
1002	open(TH, "<$thesaurus_links");
1003	open(THOUT, ">$thesaurusfile");
1004
1005	while(<TH>) {
1006	chomp;
1007	($num, $text) = split(/:/, $_);
1008
1009	if (defined($thes_symbols{$num})) {
1010	print THOUT "$num:$thes_symbols{$num}:$text\n";
1011	} else {
1012	print THOUT "$num:untranslated:$text\n";
1013	}
1014	}
1015	close TH;
1016	close THOUT;
1017	}
1018
1019
1020
1021
1022	}
1023
1024
1025	# renumber_phrases
1026	#
1027	# Prepare the phrases file to be input to mgpp. The biggest problem is
1028	# reconciling the phrase identifiers used by the suffix program (which
1029	# we'll call suffix-id numbers) with the numbers used in the thesaurus
1030	# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
1031	# Phind-id numbers must be sorted by frequency of occurance.
1032	#
1033	# Start creating a set of phind-id numbers from the sorted suffix-id
1034	# numbers and (if required) the thesaurus-id numbers. Then add any other
1035	# phrases occuring in the thesaurus.
1036	#
1037	# The last thing we have to do is restore the vocabulary information to the
1038	# phrase file so that the phrases are stored as words, not as symbol
1039	# numbers.
1040
1041	# The original phrases file looks something like this:
1042	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1043	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1044	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1045	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1046
1047
1048	sub renumber_phrases {
1049	my ($self) = @_;
1050
1051	renumber_suffix_data($self);
1052	renumber_thesaurus_data($self);
1053	restore_vocabulary_data($self);
1054
1055	}
1056
1057
1058
1059	# renumber_suffix_data
1060	#
1061	# Translate phrases file to phrases.2 using phind keys instead
1062	# of suffix keys and sorting the expansion data.
1063
1064	sub renumber_suffix_data {
1065	my ($self) = @_;
1066
1067	my $verbosity = $self->{'verbosity'};
1068	my $out = $self->{'outhandle'};
1069	print $out "Translate phrases: suffix-ids become phind-id's\n"
1070	if ($verbosity);
1071
1072	my $phinddir = $self->{'phinddir'};
1073	my $infile = &util::filename_cat($phinddir, 'phrases');
1074	my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1075
1076	# Read the phrase file. Calculate initial set of phind-id
1077	# numbers and store (suffixid -> frequency) relation.
1078
1079	my %suffixtophind;
1080	my @totalfrequency;
1081	my (@fields, $suffixid);
1082	my $nextphind = 1;
1083
1084	open(IN, "<$infile");
1085	while(<IN>) {
1086
1087	chomp;
1088	@fields = split(/:/, $_);
1089
1090	# get next suffixid and phindid
1091	$suffixid = shift @fields;
1092	$suffixtophind{$suffixid} = $nextphind;
1093
1094	# store total frequency
1095	shift @fields;
1096	$totalfrequency[$nextphind] = shift @fields;
1097
1098	$nextphind++;
1099	}
1100	close IN;
1101
1102
1103	# Translate phrases file to phrases.2. Use phind keys (not suffix
1104	# keys), sort expansion and document occurance data in order of
1105	# descending frequency..
1106	open(IN, "<$infile");
1107	open(OUT, ">$outfile");
1108
1109	my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1110	my (@documwents, @newexp, $k, $n);
1111	my $linenumber = 0;
1112
1113	while(<IN>) {
1114
1115	# read the line
1116	chomp;
1117	@fields = split(/:/, $_);
1118
1119	# get a phrase number for this line
1120	$suffixid = shift @fields;
1121	die unless (defined($suffixtophind{$suffixid}));
1122	$phindid = $suffixtophind{$suffixid};
1123
1124	# get the symbols in the phrase
1125	$text = shift @fields;
1126
1127	# output status information
1128	$linenumber++;
1129	if ($verbosity > 2) {
1130	if ($linenumber % 1000 == 0) {
1131	print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1132	}
1133	# what are $num and $key??
1134	#print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1135	}
1136
1137	# get the phrase frequency
1138	$tf = shift @fields;
1139
1140	# get the number of expansions
1141	$countexp = shift @fields;
1142
1143	# get the expansions, convert them into phind-id numbers, and sort them
1144	$expansions = shift @fields;
1145	@newexp = ();
1146	foreach $k (split(/,/, $expansions)) {
1147	die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1148	$n = $suffixtophind{$k};
1149	push @newexp, $n;
1150	}
1151	@newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1152
1153	# get the number of documents
1154	$countdocs = shift @fields;
1155
1156	# get the documents and sort them
1157	$documents = shift @fields;
1158	$documents =~ s/d//g;
1159	my @documents = split(/;/, $documents);
1160	@documents = sort by_doc_frequency @documents;
1161
1162	# output the phrase data
1163	print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1164	print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1165
1166	}
1167
1168	close IN;
1169	close OUT;
1170	}
1171
1172
1173	# renumber_thesaurus_data
1174	#
1175	# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1176
1177	sub renumber_thesaurus_data {
1178	my ($self) = @_;
1179
1180	my $out = $self->{'outhandle'};
1181	my $verbosity = $self->{'verbosity'};
1182	my $thesaurus = $self->{'thesaurus'};
1183
1184	my $phinddir = $self->{'phinddir'};
1185	my $infile = &util::filename_cat($phinddir, "phrases.2");
1186	my $outfile = &util::filename_cat($phinddir, "phrases.3");
1187
1188
1189	# If no thesaurus is defined, simply move the phrases file.
1190	if (!$thesaurus) {
1191	print $out "Translate phrases.2: no thesaurus data\n"
1192	if ($verbosity);
1193	&util::mv($infile, $outfile);
1194	return;
1195	}
1196
1197	print $out "Translate phrases.2: add thesaurus data\n"
1198	if ($verbosity);
1199
1200	# 1.
1201	# Read thesaurus file and store (symbols->thesaurusid) mapping
1202	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1203	my %symbolstothesid;
1204	my (@fields, $thesid, $symbols);
1205
1206	open(TH, "<$thesaurusfile");
1207
1208	while (<TH>) {
1209
1210	chomp;
1211	@fields = split(/:/, $_);
1212
1213	# get id and text
1214	$thesid = shift @fields;
1215	$symbols = shift @fields;
1216	$symbolstothesid{$symbols} = $thesid;
1217	}
1218	close TH;
1219
1220	# 2.
1221	# Read phrases file to find thesaurus entries that already
1222	# have a phindid. Store their phind-ids for later translation,
1223	# and store their frequency for later sorting.
1224	my %thesaurustophindid;
1225	my %phindidtofrequency;
1226	my ($phindid, $freq);
1227
1228	open(IN, "<$infile");
1229
1230	while(<IN>) {
1231
1232	chomp;
1233	@fields = split(/:/, $_);
1234
1235	# phindid and symbols for this line
1236	$phindid = shift @fields;
1237	$symbols = shift @fields;
1238	$freq = shift @fields;
1239
1240	# do we have a thesaurus id corresponding to this phrase?
1241	if (defined($symbolstothesid{$symbols})) {
1242	$thesid = $symbolstothesid{$symbols};
1243	$thesaurustophindid{$thesid} = $phindid;
1244	$phindidtofrequency{$phindid} = $freq;
1245	}
1246	}
1247	close IN;
1248
1249	undef %symbolstothesid;
1250
1251	# 3.
1252	# Create phind-id numbers for remaining thesaurus entries,
1253	# and note that their frequency is 0 for later sorting.
1254	my $nextphindid = $phindid + 1;
1255
1256	open(TH, "<$thesaurusfile");
1257	while(<TH>) {
1258
1259	chomp;
1260	@fields = split(/:/, $_);
1261
1262	# read thesaurus-id and ensure it has a corresponding phind-id
1263	$thesid = shift @fields;
1264	if (!defined($thesaurustophindid{$thesid})) {
1265	$thesaurustophindid{$thesid} = $nextphindid;
1266	$phindidtofrequency{$nextphindid} = 0;
1267	$nextphindid++;
1268	}
1269	}
1270	close TH;
1271
1272	# 4.
1273	# Translate thesaurus file, replacing thesaurus-id numbers with
1274	# phind-id numbers.
1275	my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1276	my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1277	my (@links, $linkid, %linkidtotype, $newrelation);
1278
1279	open(TH, "<$thesaurusfile");
1280	open(TO, ">$newthesaurusfile");
1281	while(<TH>) {
1282
1283	chomp;
1284	@fields = split(/:/, $_);
1285
1286	# phindid and symbols for this line
1287	($thesid, $symbols, $relations) = @fields;
1288
1289	die unless ($thesid && $symbols);
1290	die unless $thesaurustophindid{$thesid};
1291	$phindid = $thesaurustophindid{$thesid};
1292
1293	# convert each part of the relation string to use phind-id numbers
1294	# at the same time, we want to sort the list by frequency.
1295	undef %linkidtotype;
1296
1297	foreach $linktext (split(/;/, $relations)) {
1298	@linkdata = split(/,/, $linktext);
1299
1300	# remember the linktype (e.g. BT, NT)
1301	$linktype = shift @linkdata;
1302
1303	# store the type of each link
1304	foreach $thesid (@linkdata) {
1305	die unless (defined($thesaurustophindid{$thesid}));
1306	$linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1307	}
1308	}
1309
1310	# sort the list of links, first by frequency, then by type.
1311	@links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1312	or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1313	$linkcounter = (scalar @links);
1314
1315	# create a string describing the link information
1316	$linktype = $linkidtotype{$links[0]};
1317	$newrelation = $linktype;
1318	foreach $linkid (@links) {
1319	if ($linkidtotype{$linkid} ne $linktype) {
1320	$linktype = $linkidtotype{$linkid};
1321	$newrelation .= ";" . $linktype;
1322	}
1323	$newrelation .= "," . $linkid;
1324	}
1325	$newrelation .= ";";
1326
1327
1328	# output the new line
1329	print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1330	}
1331	close TH;
1332	close TO;
1333
1334	undef %thesaurustophindid;
1335	undef %linkidtotype;
1336	undef %phindidtofrequency;
1337
1338	# 5.
1339	# Read thesaurus data (in phind-id format) into memory
1340	my %thesaurusdata;
1341
1342	open(TH, "<$newthesaurusfile");
1343	while(<TH>) {
1344	chomp;
1345	($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1346	die unless ($phindid && $symbols);
1347	$thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1348	}
1349	close TH;
1350
1351	# 6.
1352	# Add thesaurus data to phrases file
1353	my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1354	my (@documwents, @newexp, $k, $n);
1355	my $linenumber = 0;
1356
1357	open(IN, "<$infile");
1358	open(OUT, ">$outfile");
1359
1360	# Update existing phrases
1361	while(<IN>) {
1362
1363	chomp;
1364	@fields = split(/:/, $_);
1365
1366	# get data for this line
1367	$phindid = shift @fields;
1368
1369	# output the phrase data, with thesaurus information
1370	print OUT "$phindid:", join(":", @fields);
1371
1372	# add thesaurus data
1373	if (defined($thesaurusdata{$phindid})) {
1374	@fields = split(/:/, $thesaurusdata{$phindid});
1375	shift @fields;
1376	$linkcounter = shift @fields;
1377	$relations = shift @fields;
1378
1379	print OUT ":$linkcounter:$relations";
1380	$thesaurusdata{$phindid} = "";
1381	}
1382	print OUT "\n";
1383	}
1384	close IN;
1385
1386	# Add phrases that aren't already in the file
1387	foreach $phindid (sort numerically keys %thesaurusdata) {
1388	next unless ($thesaurusdata{$phindid});
1389
1390	@fields = split(/:/, $thesaurusdata{$phindid});
1391	$symbols = shift @fields;
1392	$linkcounter = shift @fields;
1393	$relations = shift @fields;
1394
1395	print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1396	}
1397	close OUT;
1398
1399	}
1400
1401	# restore_vocabulary_data
1402	#
1403	# Read phrases.3 and restore vocabulary information. Then write
1404	# this data to the MGPP input files (pword.txt and pdata.txt) and
1405	# (if requested) to the saved phrases file.
1406
1407	sub restore_vocabulary_data {
1408	my ($self) = @_;
1409
1410	my $out = $self->{'outhandle'};
1411	my $verbosity = $self->{'verbosity'};
1412	print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1413
1414	my $phinddir = $self->{'phinddir'};
1415	my $infile = &util::filename_cat($phinddir, 'phrases.3');
1416	my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1417	my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1418	my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1419
1420	my $savephrases = $self->{'savephrases'};
1421
1422	# 1.
1423	# Read the vocabulary file
1424	open(V, "<$vocabfile")
1425	\|\| die "Cannot open $vocabfile: $!";
1426	my @symbol;
1427	my $i = 1;
1428	while(<V>) {
1429	chomp;
1430	$symbol[$i++] = $_;
1431	}
1432	close V;
1433
1434	# 2.
1435	# Translate phrases.3 to MGPP input files
1436	my ($key, $text, $word, $isThesaurus, $line);
1437	my @fields;
1438	my $linenumber = 0;
1439
1440	open(IN, "<$infile");
1441	open(DATA, ">$datafile");
1442	open(WORD, ">$wordfile");
1443
1444	# Save the phrases in a separate text file
1445	if ($savephrases) {
1446	print $out "Saving phrases in $savephrases\n" if ($verbosity);
1447	open(SAVE, ">$savephrases");
1448	}
1449
1450	while(<IN>) {
1451
1452	# read the line
1453	chomp;
1454	$line = $_;
1455	@fields = split(/:/, $line);
1456
1457	# get a phrase number for this line
1458	$key = shift @fields;
1459
1460	# restore the text of the phrase
1461	$text = shift @fields;
1462	$text =~ s/s(\d+)/$symbol[$1]/g;
1463	if ($text =~ / /) {
1464	$word = "";
1465	} elsif ($text ne 'untranslated') {
1466	$word = $text;
1467	}
1468
1469	# output the phrase data
1470	print DATA "<Document>";
1471	print DATA "$key:$text:", join(":", @fields), ":\n";
1472
1473	# output the word index search data
1474	print WORD "<Document>$word\n";
1475
1476	# output the phrases to a text file
1477	if ($savephrases) {
1478	if ((scalar @fields) == 7) {
1479	$isThesaurus = 1;
1480	} else {
1481	$isThesaurus = 0;
1482	}
1483	print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1484	}
1485	}
1486	close IN;
1487	close WORD;
1488	close DATA;
1489	close SAVE if ($savephrases);
1490
1491	}
1492
1493
1494
1495	# sort routines used to renumber phrases
1496
1497	sub numerically { $a <=> $b }
1498
1499	sub by_doc_frequency {
1500	my $fa = 1;
1501	if ($a =~ /,/) {
1502	$fa = $a;
1503	$fa =~ s/\d+,//;
1504	}
1505	my $fb = 1;
1506	if ($b =~ /,/) {
1507	$fb = $b;
1508	$fb =~ s/\d+,//;
1509	}
1510
1511	return ($fb <=> $fa);
1512	}
1513
1514	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: