source: main/trunk/greenstone2/perllib/classify/Phind.pm@ 22731

Last change on this file since 22731 was 22382, checked in by kjdon, 14 years ago

if language is ar for phind, then we set textorientation parameter to rtl.

  • Property svn:keywords set to Author Date Id Revision
File size: 41.1 KB
Line 
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32package Phind;
33
34use BaseClassifier;
35use util;
36use ghtml;
37use unicode;
38
39use strict;
40no strict 'refs'; # allow filehandles to be variables and viceversa
41
# Directories created during building (one phindN dir per classifier
# instance); they are tidied up in the END block unless -untidy was given.
my @removedirs = ();

# File extensions of the MGPP/phind index files that must be KEPT in each
# phind directory; at exit, any other file found there is treated as a
# stray working file and deleted.
my %wanted_index_files = ('td'=>1,
			  't'=>1,
			  'ti'=>1,
			  'tl'=>1,
			  'tsd'=>1,
			  'idb'=>1,
			  'ib1'=>1,
			  'ib2'=>1,
			  'ib3'=>1,
			  'i'=>1,
			  'il'=>1,
			  'w'=>1,
			  'wa'=>1);
57
# Establish inheritance before the rest of the file is compiled:
# Phind extends BaseClassifier.
sub BEGIN {
    @Phind::ISA = ('BaseClassifier');
}
61
sub END {

    # Tidy up stray files.  We do this here as there's some weird problem
    # preventing us from doing it in the get_classify_info() function (on
    # windows at least) where the close() appears to fail on txthandle and
    # dochandle, thus preventing us from deleting those files.

    foreach my $dir (@removedirs) {
	# Skip directories that have already vanished or can't be read;
	# use a lexical dirhandle so we can't clobber a global bareword
	# handle belonging to other code running at interpreter shutdown.
	next unless (-d $dir);
	opendir(my $dirhandle, $dir) || next;
	my @files = readdir $dirhandle;
	closedir $dirhandle;

	foreach my $file (@files) {
	    # ignore the "." and ".." directory entries
	    next if $file =~ /^\.\.?$/;
	    # keep only files whose extension marks them as index files
	    my ($suffix) = $file =~ /\.([^\.]+)$/;
	    if (!defined $suffix || !exists $wanted_index_files{$suffix}) {
		# not a wanted index file - delete it!
		&util::rm (&util::filename_cat ($dir, $file));
	    }
	}
    }
}
85
# Configuration arguments accepted by this classifier (parsed from
# collect.cfg and reported by "classinfo.pl Phind").
my $arguments =
    [ { 'name' => "text",
	'desc' => "{Phind.text}",
	'type' => "string",
	'deft' => "section:Title,section:text",
	'reqd' => "no" },
      { 'name' => "title",
	'desc' => "{Phind.title}",
	'type' => "metadata",
	'deft' => "Title",
	'reqd' => "no" },
      { 'name' => "buttonname",
	'desc' => "{BasClas.buttonname}",
	'type' => "string",
	'deft' => "Phrase",
	'reqd' => "no" },
      { 'name' => "language",
	'desc' => "{Phind.language}",
	'type' => "string",
	'deft' => "en",
	'reqd' => "no" },
      { 'name' => "savephrases",
	'desc' => "{Phind.savephrases}",
	'type' => "string",
	'deft' => "",
	'reqd' => "no" },
      { 'name' => "suffixmode",
	'desc' => "{Phind.suffixmode}",
	'type' => "int",
	'deft' => "1",
	'range' => "0,1",
	'reqd' => "no" },
      { 'name' => "min_occurs",
	'desc' => "{Phind.min_occurs}",
	'type' => "int",
	'deft' => "2",
	'range' => "1,",
	'reqd' => "no" },
      { 'name' => "thesaurus",
	'desc' => "{Phind.thesaurus}",
	'type' => "string",
	'deft' => "",
	'reqd' => "no" },
      { 'name' => "untidy",
	'desc' => "{Phind.untidy}",
	'type' => "flag",
	'reqd' => "no" } ];

# Classifier description used by the plugin-information machinery.
my $options = { 'name' => "Phind",
		'desc' => "{Phind.desc}",
		'abstract' => "no",
		'inherits' => "yes",
		'args' => $arguments };


# Phrase delimiter symbols - these should be abstracted out someplace.
# They mark collection/document/sentence boundaries in the clauses file
# consumed by the suffix program.

my $colstart = "COLLECTIONSTART";
my $colend = "COLLECTIONEND";
my $doclimit = "DOCUMENTLIMIT";
my $senlimit = "SENTENCELIMIT";
my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
149
# Create a new Phind browser based on collect.cfg.
#
# Arguments follow the standard Greenstone classifier constructor
# protocol: a list of classifier names, the raw input arguments, and the
# hash of argument/option descriptions to be extended.  Returns a blessed
# Phind instance; exits the process if the required "suffix" helper
# program is not installed.

sub new {
    my ($class) = shift (@_);
    my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
    push(@$classifierslist, $class);

    # register our argument and option descriptions with the base class
    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # direct method call rather than the deprecated indirect-object syntax
    my $self = BaseClassifier->new($classifierslist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
	# don't worry about any options etc
	return bless $self, $class;
    }

    # Ensure the Phind "suffix" phrase-extraction program is in place
    # (/i for consistency with the GSDLOS test in get_classify_info)
    my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
    $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
    if (!(-e $file1)) {
	print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
	exit(1);
    }

    # things that may have ex. in them that need to be stripped off
    $self->{'text'} = $self->strip_ex_from_metadata($self->{'text'});
    $self->{'title'} = $self->strip_ex_from_metadata($self->{'title'});

    # Transfer value from Auto Parsing to the variable name that was used
    # in previous versions of Greenstone.
    $self->{"indexes"} = $self->{"text"};

    # Further setup
    $self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
    $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
    if (! defined $self->{'builddir'}) {
	$self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
    }
    $self->{'total'} = 0; # number of documents classified so far

    # we set phind to be rtl if language is arabic
    # (guard against an undef language to avoid a warning under -w)
    if (defined $self->{'language'} && $self->{'language'} eq "ar") {
	$self->{'textorientation'} = "rtl";
    }
    # Clean out the unused keys
    delete $self->{"text"};

    return bless $self, $class;
}
201
202
# Initialise the Phind classifier.
#
# Creates a fresh phindN working directory under the build directory and
# opens the two output files this classifier writes during classify():
# "clauses" (the extracted text) and "docs.txt" (document details).
# Dies if the build directory does not exist or a file cannot be opened.

sub init {
    my $self = shift (@_);

    # ensure we have a build directory
    my $builddir = $self->{'builddir'};
    die "Phind.pm: build directory '$builddir' does not exist\n" unless (-e "$builddir");

    # create Phind directory - pick the first unused phindN name so
    # several Phind classifiers can coexist in one collection
    my $phnumber = 1;
    my $phinddir = &util::filename_cat($builddir, "phind1");
    while (-e "$phinddir") {
	$phnumber++;
	$phinddir = &util::filename_cat($builddir, "phind$phnumber");
    }
    &util::mk_dir("$phinddir");
    $self->{'phinddir'} = $phinddir;
    $self->{'phindnumber'} = $phnumber;

    # schedule the directory for cleanup at exit unless -untidy was given
    push(@removedirs, $phinddir) unless $self->{'untidy'};

    # open filehandles for documents and text.  The handle names are
    # strings (allowed by "no strict 'refs'" above) so each phindN
    # instance gets its own handle; three-argument open is used so the
    # file name can never be misread as a mode.
    my $clausefile = &util::filename_cat("$phinddir", "clauses");
    &util::rm($clausefile) if (-e $clausefile);

    my $txthandle = 'TEXT' . $phnumber;
    open($txthandle, ">", $clausefile) || die "Cannot open $clausefile: $!";
    $self->{'txthandle'} = $txthandle;

    my $docfile = &util::filename_cat("$phinddir", "docs.txt");
    &util::rm($docfile) if (-e $docfile);

    my $dochandle = 'DOC' . $phnumber;
    open($dochandle, ">", $docfile) || die "Cannot open $docfile: $!";
    $self->{'dochandle'} = $dochandle;

}
241
242
# Classify each document.
#
# Each document is passed here in turn. The classifier extracts the
# text of each and stores it in the clauses file. Document details are
# stored in the docs.txt file.  Documents whose Language metadata does
# not match the classifier's language are skipped.

sub classify {
    my $self = shift (@_);
    my ($doc_obj,$edit_mode) = @_;

    my $verbosity = $self->{'verbosity'};
    my $top_section = $doc_obj->get_top_section();

    my $titlefield = $self->{'title'};

    my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
    if (!defined($title)) {
	$title = "";
	print STDERR "Phind: document has no title\n";
    }
    print "process: $title\n" if ($verbosity > 2);

    # Only consider the file if it is in the correct language
    my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
    my $phrlanguage = $self->{'language'};
    return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));

    if ($edit_mode eq "delete") {
	# This classifier works quite differently to the others
	# Probably doesn't support incremental building anyway
	return;
    }

    # record this file
    $self->{'total'} ++;
    # what is $file ???
    # print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});

    # Store document details (tab-separated record read back later by
    # get_classify_info / the mgpp docs database build)
    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;
    my $dochandle = $self->{'dochandle'};
    print $dochandle "<Document>\t$OID\t$title\n";

    # Store the text occurring in this object

    # output the document delimiter so the suffix program can tell
    # documents apart in the clauses file
    my $txthandle = $self->{'txthandle'};
    print $txthandle "$doclimit\n";

    # iterate over the required indexes and store their text
    my $indexes = $self->{'indexes'};
    my $text = "";
    my ($part, $level, $field, $section, $data, $dataref);

    foreach $part (split(/,/, $indexes)) {

	# Each field has a level and a data element (e.g. document:Title)
	($level, $field) = split(/:/, $part);
	die unless ($level && $field);

	# Extract the text from every section
	# (In phind, document:text and section:text are equivalent)
	if ($field eq "text") {
	    $data = "";
	    $section = $doc_obj->get_top_section();
	    while (defined($section)) {
		$data .= $doc_obj->get_text($section) . "\n";
		$section = $doc_obj->get_next_section($section);
	    }
	    $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	}

	# Extract a metadata field from a document
	# (If there is more than one element of the given type, get them all.)
	elsif ($level eq "document") {
	    $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
	    foreach $data (@$dataref) {
		$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	    }
	}

	# Extract metadata from every section in a document
	elsif ($level eq "section") {
	    $data = "";
	    $section = $doc_obj->get_top_section();
	    while (defined($section)) {
		$dataref = $doc_obj->get_metadata($section, $field);
		$data .= join("\n", @$dataref) . "\n";
		$section = $doc_obj->get_next_section($section);
	    }
	    $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	}

	# Some sort of specification which I don't understand
	else {
	    die "Unknown level ($level) in Phind index ($part)\n";
	}

    }

    # output the text, squashing runs of blank clause breaks to one
    $text =~ tr/\n//s;
    print $txthandle "$text";
}
348
349
# Construct the classifier from the information already gathered
#
# When get_classify_info is called, the clauses and docs.txt files have
# already been constructed in the Phind directory. This function will
# translate them into compressed, indexed MGPP files that can be read by
# the phindcgi script. It will also register our classifier so that it
# shows up in the navigation bar.
#
# The <Stage>/<Phase> lines printed to STDERR are progress markers
# consumed by the GLI when $gli is true.

sub get_classify_info {
    my $self = shift (@_);
    my ($gli) = @_;

    close $self->{'dochandle'};
    close $self->{'txthandle'};
    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};
    my $phinddir = $self->{'phinddir'};

    # extra flag passed to the mgpp tools on non-Windows platforms
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
	$osextra = " -d /";
    }

    print STDERR "</Stage>\n" if $gli;

    if ($verbosity) {
	print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
	print $out "*** in ", $self->{'phinddir'}, "\n";
    }

    print STDERR "<Stage name='Phind'>\n" if $gli;

    # Construct phind indexes
    my $suffixmode = $self->{'suffixmode'};
    my $min_occurs = $self->{'min_occurs'};
    my ($command, $status);

    # Generate the vocabulary, symbol statistics, and numbers file
    # from the clauses file
    print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
    print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
    &extract_vocabulary($self);

    # Use the suffix program to generate the phind/phrases file
    print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
    print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
    &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);

    # check that we generated some files. It's not necessarily an error if
    # we didn't (execute() would have quit on error), but we can't go on.
    my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
    if (! -r $phrasesfile) {
	print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
	print $out "\nNo phrases found for Phind classifier!\n";
	return;
    }

    # Create the phrase file and put phrase numbers in phind/phrases
    print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
    print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
    &renumber_phrases($self);

    # Build the mgpp phrase-data database (pass 1, dictionary, pass 2)
    print $out "\nCreating phrase databases\n";
    print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
    my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
    my $mg_stem = &util::filename_cat($phinddir, "pdata");

    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);

    # create the mg index of words
    print $out "\nCreating word-level search indexes\n";
    print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
    $mg_input = &util::filename_cat($phinddir, "pword.txt");
    $mg_stem = &util::filename_cat($phinddir, "pword");

    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);

    # stemmed indexes: case folding (1), accent folding (2), both (3)
    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);

    # create the mg document information database
    print $out "\nCreating document information databases\n";
    print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
    $mg_input = &util::filename_cat($phinddir, "docs.txt");
    $mg_stem = &util::filename_cat($phinddir, "docs");

    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);

    # parameters passed through to the phind applet/CGI
    my $parameters = "phindnumber=$self->{'phindnumber'}";
    if (defined ($self->{'textorientation'})) {
	$parameters .= ";textorientation=$self->{'textorientation'}";
    }
    # Return the information about the classifier that we'll later want to
    # use to create macros when the Phind classifier document is displayed.
    my %classifyinfo = ('thistype'=>'Invisible',
			'childtype'=>'Phind',
			'Title'=>$self->{'buttonname'},
			'parameters'=>$parameters,
			'contains'=>[]);

    my $collection = $self->{'collection'};
    my $url = "library?a=p&p=phind&c=$collection";
    push (@{$classifyinfo{'contains'}}, {'OID'=>$url});

    return \%classifyinfo;
}
466
467
468
# Convert GML-marked-up text into the one-clause-per-line token format
# consumed by the suffix program.  English and Chinese are dispatched to
# specially tuned versions; all other languages take the generic path.

sub convert_gml_to_tokens {

    my ($language_exp, $text) = @_;

    # escape any magic words... - jrm21
    # (lower-casing them stops document text from forging the special
    # collection/document/sentence delimiter symbols)
    foreach my $delim (@delimiters) {
	my $replacement=lc($delim);
	$text =~ s/$delim/$replacement/g;
    }

    if ($language_exp =~ /^en$/) {
	return &convert_gml_to_tokens_EN($text);
    }

    if ($language_exp =~ /zh/) {
	return &convert_gml_to_tokens_ZH($text);
    }

    # localise $_ so the substitutions below don't clobber the caller's
    # $_ as a side effect
    local $_ = $text;

    # 1. remove GML tags

    # Remove everything that is in a tag
    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
    s/\s*<br>\s*/ LINEBREAK /isgo;
    s/<[^>]*>/ /sgo;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc. Remove these tags.
    s/&amp;/&/sgo;
    s/&lt;/</sgo;
    s/&gt;/>/sgo;
    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
    s/\s*<br>\s*/ LINEBREAK /isgo;
    s/<[^>]*>/ /sgo;

    # replace <p> and <br> placeholders with clause break symbol (\n)
    s/\s+/ /gso;
    s/PARAGRAPHBREAK/\n/sgo;
    s/LINEBREAK/\n/sgo;


    # 2. Split the remaining text into space-delimited tokens

    # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
    s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;

    # Split text at word boundaries
    s/\b/ /go;

    # 3. Convert the remaining text to "clause format"

    # Insert newline if the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    # s/\s*[\.\?\!]\s+/\n/go;

    # remove unnecessary punctuation and replace with clause break symbol (\n)
    # the following very nicely removes all non alphanumeric characters. too bad if you are not using english...
    #s/[^\w ]/\n/go;
    # replace punct with new lines - is this what we want??
    s/\s*[\?\;\:\!\,\.\"\[\]\{\}\(\)]\s*/\n/go; #"
    # then remove other punct with space
    s/[\'\`\\\_]/ /go;

    # remove extraneous whitespace
    s/ +/ /sgo;
    s/^\s+//mgo;
    s/\s*$/\n/mgo;

    # remove lines that contain one word or less
    s/^\S*$//mgo;
    s/^\s*$//mgo;
    tr/\n//s;

    return $_;
}
546
# A Chinese version of convert_gml_to_tokens: strips GML/HTML markup,
# decodes the common entities, and turns <p>/<br> into clause breaks
# (newlines).  No word-level tokenising is attempted.
#
# Works on a lexical copy of the argument - the original assigned to the
# global $_, clobbering the caller's $_ as a side effect.
sub convert_gml_to_tokens_ZH {

    my $text = shift @_;

    # Replace all whitespace with a simple space
    $text =~ s/\s+/ /gs;
    # Replace paragraph/line breaks with placeholder words, then remove
    # everything that is in a tag
    $text =~ s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    $text =~ s/\s*<br>\s*/ LINEBREAK /isg;
    $text =~ s/<[^>]*>/ /sg;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc. Remove these tags.
    $text =~ s/&lt;/</sg;
    $text =~ s/&gt;/>/sg;

    $text =~ s/\s+/ /sg;
    $text =~ s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    $text =~ s/\s*<br>\s*/ LINEBREAK /isg;
    $text =~ s/<[^>]*>/ /sg;

    # remove &amp; and other miscellaneous markup tags
    # (the second &amp; pass catches entities exposed by the first,
    # e.g. "&amp;amp;" -> "&amp;" -> "&")
    $text =~ s/&amp;/&/sg;
    $text =~ s/&lt;/</sg;
    $text =~ s/&gt;/>/sg;
    $text =~ s/&amp;/&/sg;

    # replace <p> and <br> placeholders with carriage returns
    $text =~ s/PARAGRAPHBREAK/\n/sg;
    $text =~ s/LINEBREAK/\n/sg;

    return $text;
}
583
# A version of convert_gml_to_tokens that is fine-tuned to the English
# language: strips GML/HTML markup, applies English-specific punctuation
# handling (omitted-letter apostrophes, initials, hyphenated words), and
# returns the text as one clause per line.
#
# Works on a lexical copy of the argument - the original assigned to the
# global $_, clobbering the caller's $_ as a side effect.

sub convert_gml_to_tokens_EN {
    my $text = shift @_;

    # FIRST, remove GML tags

    # Replace all whitespace with a simple space
    $text =~ s/\s+/ /gs;

    # Remove everything that is in a tag
    $text =~ s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    $text =~ s/\s*<br>\s*/ LINEBREAK /isg;
    $text =~ s/<[^>]*>/ /sg;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc. Remove these tags.
    $text =~ s/&lt;/</sg;
    $text =~ s/&gt;/>/sg;

    $text =~ s/\s+/ /sg;
    $text =~ s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    $text =~ s/\s*<br>\s*/ LINEBREAK /isg;
    $text =~ s/<[^>]*>/ /sg;

    # remove &amp; and other miscellaneous markup tags
    # (the second &amp; pass catches entities exposed by the first)
    $text =~ s/&amp;/&/sg;
    $text =~ s/&lt;/</sg;
    $text =~ s/&gt;/>/sg;
    $text =~ s/&amp;/&/sg;

    # replace <p> and <br> placeholders with carriage returns
    $text =~ s/PARAGRAPHBREAK/\n/sg;
    $text =~ s/LINEBREAK/\n/sg;


    # Exceptional punctuation
    #
    # We make special cases of some punctuation

    # remove any apostrophe that indicates omitted letters
    $text =~ s/(\w+)\'(\w*\s)/ $1$2 /g;

    # remove period that appears in a person's initials
    $text =~ s/\s([A-Z])\./ $1 /g;

    # replace hyphens in hyphenated words and names with a space
    $text =~ s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;

    # Convert the remaining text to "clause format",
    # This means removing all excess punctuation and garbage text,
    # normalising valid punctuation to fullstops and commas,
    # then putting one clause on each line.

    # Insert newline when the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    $text =~ s/\s*[\.\?\!]\s+/\n/g;

    # split numbers after four digits
    $text =~ s/(\d\d\d\d)/$1 /g;

    # split words after 32 characters

    # squash repeated punctuation
    $text =~ tr/A-Za-z0-9 //cs;

    # save email addresses
    # $text =~ s/\w+@\w+\.[\w\.]+/EMAIL/g;

    # normalise clause breaks (mostly punctuation symbols) to commas
    $text =~ s/[^A-Za-z0-9 \n]+/ , /g;

    # Remove repeated commas, and replace with newline
    $text =~ s/\s*,[, ]+/\n/g;

    # remove extra whitespace
    $text =~ s/ +/ /sg;
    $text =~ s/^\s+//mg;
    $text =~ s/\s*$/\n/mg;

    # remove lines that contain one word or less
    $text =~ s/^\w*$//mg;
    $text =~ s/^\s*$//mg;
    $text =~ tr/\n//s;

    return $text;

}
672
673
674
# Execute a system command, echoing it first when verbosity is high.
# On failure the whole build is aborted with a non-zero exit code.
#
# NOTE: system() returns the 16-bit wait status, not the child's exit
# code.  The old code did exit($status), but exit() truncates its
# argument to 8 bits, so a child exit code of 1 (status 256) made this
# process exit 0 and a failed build looked successful.  We now exit with
# the child's real exit code ($status >> 8), or 1 if the child died on a
# signal (in which case $status >> 8 is 0).

sub execute {
    my ($command, $verbosity, $outhandle) = @_;
    print $outhandle "Executing: $command\n" if ($verbosity > 2);
    $! = 0;
    my $status = system($command);
    if ($status != 0) {
	print STDERR "Phind - Error executing '$command': $!\n";
	exit(($status >> 8) || 1); # this causes the build to fail...
    }
}
687
688
# Generate the vocabulary, symbol statistics, and numbers file from the
# clauses file. This is legacy code, so is a bit messy and probably wont
# run under windows.
#
# Outputs (all in the phind working directory):
#   clauses.vocab   - one "best form" word per line; line number = symbol
#   clauses.stats   - first/last symbol numbers for each word category
#   clauses.numbers - the clauses text re-encoded as symbol numbers
#   <thesaurus>.numbers - thesaurus links/terms as symbol numbers
#                         (only when a thesaurus is configured)

sub extract_vocabulary {
    my ($self) = @_;

    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};

    my $collectiondir = $self->{'collectiondir'};
    my $phinddir = $self->{'phinddir'};

    my $language_exp = $self->{'language'};

    my ($w, $l, $line, $word);

    # boundaries of each symbol category, written to clauses.stats below
    my ($first_delimiter, $last_delimiter,
	$first_stopword, $last_stopword,
	$first_extractword, $last_extractword,
	$first_contentword, $last_contentword,
	$phrasedelimiter);

    my $thesaurus = $self->{'thesaurus'};
    my ($thesaurus_links, $thesaurus_terms,
	%thesaurus, $first_thesaurusword, $last_thesaurusword);

    my %symbol;   # word (or delimiter) -> symbol number
    my (%freq);   # token -> occurrence count in the clauses file

    print $out "Calculating vocabulary\n" if ($verbosity > 1);

    # Read and store the stopwords
    my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
    my $stopword_files = (); # NOTE(review): never used below - looks vestigial
    my ($language, $language_dir, $file, $file_name);
    my %stopwords;

    # Examine each directory in the stopword directory
    opendir(STOPDIR, $stopdir);
    foreach $language (readdir STOPDIR) {

	# Ignore entries that do not match the classifier's language
	next unless ($language =~ /$language_exp/);
	$language_dir = &util::filename_cat($stopdir, $language);
	next unless (-d "$language_dir");

	opendir(LANGDIR, $language_dir);
	foreach $file (readdir LANGDIR) {

	    # Ignore entries that are not stopword files
	    next unless ($file =~ /sw$/);
	    $file_name = &util::filename_cat($language_dir, $file);
	    next unless (-f "$file_name");

	    # Read the stopwords: first whitespace-delimited token on
	    # each line, keyed by its lower-cased form
	    open(STOPFILE, "<$file_name");
	    while (<STOPFILE>) {
		s/^\s+//;
		s/\s.*//;
		$word = $_;
		$l = lc($word);
		$stopwords{$l} = $word;
	    }
	    close STOPFILE;

	}
	closedir LANGDIR;
    }
    closedir STOPDIR;

    # Read thesaurus information
    if ($thesaurus) {

	# link file exists
	$thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
	die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");

	# ensure term file exists in the correct language
	# (two-letter language code; fall back to English)
	if ($language_exp =~ /^([a-z][a-z])/) {
	    $language = $1;
	} else {
	    $language = 'en';
	}
	$thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
	die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");


	# Read the thesaurus terms, stripping the leading record number
	# and any parenthesised qualifier
	open(TH, "<$thesaurus_terms");
	while(<TH>) {
	    s/^\d+ //;
	    s/\(.*\)//;
	    foreach $w (split(/\s+/, $_)) {
		$thesaurus{lc($w)} = $w;
	    }
	}
	close TH;
    }

    # Read words in the text and count occurrences.  Stopwords and
    # thesaurus terms are counted under their lower-cased form.
    open(TXT, "<$phinddir/clauses");

    my @words;
    while(<TXT>) {
	$line = $_;
	next unless ($line =~ /./);

	@words = split(/\s+/, $line);
	foreach $w (@words) {
	    $l = lc($w);
	    $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
	    $freq{$w}++;
	}
	# every clause line implies a sentence delimiter
	$freq{$senlimit}++;
    }

    close TXT;

    # Calculate the "best" form of each word: the stopword/thesaurus
    # spelling if known, otherwise the most frequent capitalisation
    my (%bestform, %totalfreq, %bestfreq);

    foreach $w (sort (keys %freq)) {
	$l = lc($w);

	# totalfreq is the number of times a term appears in any form
	$totalfreq{$l} += $freq{$w};

	if (defined $stopwords{$l}) {
	    $bestform{$l} = $stopwords{$l};

	} elsif (defined $thesaurus{$l}) {
	    $bestform{$l} = $thesaurus{$l};

	} elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
	    $bestfreq{$l} = $freq{$w};
	    $bestform{$l} = $w;
	}
    }
    undef %freq;
    undef %bestfreq;


    # Assign symbol numbers to tokens, in category order: delimiters,
    # then stopwords, then thesaurus terms, then other content words
    my $nextsymbol = 1;
    my (@vocab);

    # Delimiters
    $first_delimiter = 1;

    foreach $word (@delimiters) {

#	$word = lc($word); # jrm21
	$word = uc($word);
	$bestform{$word} = $word;
	$vocab[$nextsymbol] = $word;
	$symbol{$word} = $nextsymbol;
	$nextsymbol++;
    }
    $last_delimiter = $nextsymbol - 1;
    # Stopwords
    $first_stopword = $nextsymbol;

    foreach my $word (sort keys %stopwords) {
	# don't include stopword unless it occurs in the text
	$word = lc($word);
	next unless ($totalfreq{$word});
	next if ($symbol{$word});

	$vocab[$nextsymbol] = $word;
	$symbol{$word} = $nextsymbol;
	$nextsymbol++;
    }
    $last_stopword = $nextsymbol - 1;
    $first_contentword = $nextsymbol;

    # Thesaurus terms
    if ($thesaurus) {
	$first_thesaurusword = $nextsymbol;

	foreach my $word (sort keys %thesaurus) {

	    $word = lc($word);
	    next if ($symbol{$word});
	    $bestform{$word} = $thesaurus{$word};

	    $vocab[$nextsymbol] = $word;
	    $symbol{$word} = $nextsymbol;
	    $nextsymbol++;

	}
	$last_thesaurusword = $nextsymbol - 1;
    }

    # Other content words
    $first_extractword = $nextsymbol;

    foreach my $word (sort (keys %bestform)) {

	next if ($symbol{$word});

	$vocab[$nextsymbol] = $word;
	$symbol{$word} = $nextsymbol;
	$nextsymbol++;
    }
    $last_extractword = $nextsymbol - 1;
    $last_contentword = $nextsymbol - 1;

    # Output the words: line N of clauses.vocab is the best form of the
    # word with symbol number N
    print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
    open(VOC, ">$phinddir/clauses.vocab");

    for (my $i = 1; $i < $nextsymbol; $i++) {
	$w = $vocab[$i];

	print VOC "$bestform{$w}\n";
	$totalfreq{$w} = 0 unless ($totalfreq{$w});
    }
    close VOC;


    # Create statistics file
    # Output statistics about the vocabulary
    print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
    &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");

    open(STAT, ">$phinddir/clauses.stats")
	|| die "Cannot open $phinddir/clauses.stats: $!";

    print STAT "first_delimiter $first_delimiter\n";
    print STAT "last_delimiter $last_delimiter\n";
    print STAT "first_stopword $first_stopword\n";
    print STAT "last_stopword $last_stopword\n";
    if ($thesaurus) {
	print STAT "first_thesaurusword $first_thesaurusword\n";
	print STAT "last_thesaurusword $last_thesaurusword\n";
    }
    print STAT "first_extractword $first_extractword\n";
    print STAT "last_extractword $last_extractword\n";
    print STAT "first_contentword $first_contentword\n";
    print STAT "last_contentword $last_contentword\n";
    print STAT "first_symbol $first_delimiter\n";
    print STAT "last_symbol $last_contentword\n";
    print STAT "first_word $first_stopword\n";
    print STAT "last_word $last_contentword\n";
    close STAT;

    undef @vocab;


    # Create numbers file
    # Save text as symbol numbers
    print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);

    open(TXT, "<$phinddir/clauses");
    open(NUM, ">$phinddir/clauses.numbers");

##    $phrasedelimiter = $symbol{lc($senlimit)};  # jrm21
##    print NUM "$symbol{lc($colstart)}\n"; # jrm21
    $phrasedelimiter = $symbol{$senlimit};
    print NUM "$symbol{$colstart}\n";

    # set up the special symbols that delimit documents and sentences
    while(<TXT>) {

	# split sentence into a list of tokens
	$line = $_;
	next unless ($line =~ /./);
	@words = split(/\s+/, $line);

	# output one token at a time
	foreach $word (@words) {
# don't lower-case special delimiters - jrm21
	    if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
		$word = lc($word);
	    }
	    print NUM "$symbol{$word}\n";
	}

	# output phrase delimiter
	print NUM "$phrasedelimiter\n";
    }

    close TXT;
#    print NUM "$symbol{lc($colend)}\n";# jrm21
    print NUM "$symbol{$colend}\n";
    close NUM;

    # Save thesaurus data in one convenient file
    if ($thesaurus) {

	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");


	print $out "Saving thesaurus as numbers in $thesaurusfile\n"
	    if ($verbosity > 1);

	# Read the thesaurus terms
	my ($num, $text, %thes_symbols);

	open(TH, "<$thesaurus_terms");
	while(<TH>) {
	    chomp;
	    @words = split(/\s+/, $_);
	    $num = shift @words;
	    $text = "";

	    # translate words into symbol numbers
	    foreach $word (@words) {
		$word = lc($word);
		if ($symbol{$word}) {
		    $text .= "s$symbol{$word} ";
		} elsif ($verbosity) {
		    print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
		}
	    }
	    $text =~ s/ $//;
	    $thes_symbols{$num} = $text;
	}
	close TH;

	# Read the thesaurus links and write the corresponding data
	open(TH, "<$thesaurus_links");
	open(THOUT, ">$thesaurusfile");

	while(<TH>) {
	    chomp;
	    ($num, $text) = split(/:/, $_);

	    if (defined($thes_symbols{$num})) {
		print THOUT "$num:$thes_symbols{$num}:$text\n";
	    } else {
		print THOUT "$num:untranslated:$text\n";
	    }
	}
	close TH;
	close THOUT;
    }




}
1032
1033
1034# renumber_phrases
1035#
# Prepare the phrases file to be input to mgpp. The biggest problem is
# reconciling the phrase identifiers used by the suffix program (which
# we'll call suffix-id numbers) with the numbers used in the thesaurus
# (thesaurus-id) to create a common set of phind id numbers (phind-id).
# Phind-id numbers must be sorted by frequency of occurrence.
1041#
# Start creating a set of phind-id numbers from the sorted suffix-id
# numbers and (if required) the thesaurus-id numbers. Then add any other
# phrases occurring in the thesaurus.
1045#
1046# The last thing we have to do is restore the vocabulary information to the
1047# phrase file so that the phrases are stored as words, not as symbol
1048# numbers.
1049
1050# The original phrases file looks something like this:
1051# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1052# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1053# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1054# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1055
1056
# Run the three renumbering passes over the phrases file (see the block
# comment above).  The order appears deliberate: suffix ids are mapped to
# phind ids first, thesaurus data is merged second, and the symbol
# numbers are translated back into words last.
sub renumber_phrases {
    my ($self) = @_;

    renumber_suffix_data($self);
    renumber_thesaurus_data($self);
    restore_vocabulary_data($self);

}
1065
1066
1067
1068# renumber_suffix_data
1069#
1070# Translate phrases file to phrases.2 using phind keys instead
1071# of suffix keys and sorting the expansion data.
1072
sub renumber_suffix_data {
    my ($self) = @_;

    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};
    print $out "Translate phrases: suffix-ids become phind-id's\n"
	if ($verbosity);

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, 'phrases');
    my $outfile = &util::filename_cat($phinddir, 'phrases.2');

    # Pass 1: read the phrase file, allocate a phind-id to each
    # suffix-id in file order, and record each phrase's total frequency
    # (indexed by phind-id) so that expansion lists can be sorted by
    # descending frequency in pass 2.
    my %suffixtophind;
    my @totalfrequency;
    my (@fields, $suffixid);
    my $nextphind = 1;

    open(IN, "<", $infile)
	or die "Cannot open $infile: $!";
    while (<IN>) {

	chomp;
	@fields = split(/:/, $_);

	# get next suffixid and phindid
	$suffixid = shift @fields;
	$suffixtophind{$suffixid} = $nextphind;

	# skip the phrase text, then store the total frequency
	shift @fields;
	$totalfrequency[$nextphind] = shift @fields;

	$nextphind++;
    }
    close IN;


    # Pass 2: translate the phrase file to phrases.2, using phind keys
    # (not suffix keys) and sorting the expansion and document
    # occurrence data in order of descending frequency.
    open(IN, "<", $infile)
	or die "Cannot open $infile: $!";
    open(OUT, ">", $outfile)
	or die "Cannot open $outfile for writing: $!";

    my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
    my (@newexp, $k, $n);
    my $linenumber = 0;

    while (<IN>) {

	# read the line
	chomp;
	@fields = split(/:/, $_);

	# get a phrase number for this line
	$suffixid = shift @fields;
	die "ERROR - no phind-id for suffix-id: $suffixid"
	    unless (defined($suffixtophind{$suffixid}));
	$phindid = $suffixtophind{$suffixid};

	# get the symbols in the phrase
	$text = shift @fields;

	# output status information
	$linenumber++;
	if ($verbosity > 2) {
	    if ($linenumber % 1000 == 0) {
		print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
	    }
	}

	# get the phrase frequency
	$tf = shift @fields;

	# get the number of expansions
	$countexp = shift @fields;

	# get the expansions, convert them into phind-id numbers, and
	# sort them by descending total frequency
	$expansions = shift @fields;
	@newexp = ();
	foreach $k (split(/,/, $expansions)) {
	    die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
	    $n = $suffixtophind{$k};
	    push @newexp, $n;
	}
	@newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;

	# get the number of documents
	$countdocs = shift @fields;

	# get the documents (e.g. "d2240,2;d2253;...") and sort them by
	# descending within-document frequency
	$documents = shift @fields;
	$documents =~ s/d//g;
	my @documents = split(/;/, $documents);
	@documents = sort by_doc_frequency @documents;

	# output the phrase data
	print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
	print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";

    }

    close IN;
    close OUT
	or die "Cannot close $outfile: $!";
}
1180
1181
1182# renumber_thesaurus_data
1183#
1184# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1185
# Translate phrases.2 to phrases.3. If no thesaurus is configured the
# file is simply renamed; otherwise thesaurus entries are mapped onto
# the phind-id number space and their link data is appended to the
# matching phrase lines (new lines are added for thesaurus entries that
# never occur as phrases).
#
# NOTE(review): all opens in this sub are unchecked, so a missing input
# file silently produces empty data rather than an error.
sub renumber_thesaurus_data {
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    my $thesaurus = $self->{'thesaurus'};

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, "phrases.2");
    my $outfile = &util::filename_cat($phinddir, "phrases.3");


    # If no thesaurus is defined, simply move the phrases file.
    if (!$thesaurus) {
	print $out "Translate phrases.2: no thesaurus data\n"
	    if ($verbosity);
	&util::mv($infile, $outfile);
	return;
    }

    print $out "Translate phrases.2: add thesaurus data\n"
	if ($verbosity);

    # 1.
    # Read thesaurus file and store (symbols->thesaurusid) mapping.
    # Each line of "$thesaurus.numbers" is "thesid:symbols:relations".
    my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
    my %symbolstothesid;
    my (@fields, $thesid, $symbols);

    open(TH, "<$thesaurusfile");

    while (<TH>) {

	chomp;
	@fields = split(/:/, $_);

	# get id and text
	$thesid = shift @fields;
	$symbols = shift @fields;
	$symbolstothesid{$symbols} = $thesid;
    }
    close TH;

    # 2.
    # Read phrases file to find thesaurus entries that already
    # have a phindid. Store their phind-ids for later translation,
    # and store their frequency for later sorting.
    my %thesaurustophindid;
    my %phindidtofrequency;
    my ($phindid, $freq);

    open(IN, "<$infile");

    while(<IN>) {

	chomp;
	@fields = split(/:/, $_);

	# phindid and symbols for this line
	$phindid = shift @fields;
	$symbols = shift @fields;
	$freq = shift @fields;

	# do we have a thesaurus id corresponding to this phrase?
	if (defined($symbolstothesid{$symbols})) {
	    $thesid = $symbolstothesid{$symbols};
	    $thesaurustophindid{$thesid} = $phindid;
	    $phindidtofrequency{$phindid} = $freq;
	}
    }
    close IN;

    # the mapping is no longer needed; free the memory
    undef %symbolstothesid;

    # 3.
    # Create phind-id numbers for remaining thesaurus entries,
    # and note that their frequency is 0 for later sorting.
    #
    # NOTE(review): $phindid here still holds the id read from the LAST
    # line of phrases.2 — this assumes phind-ids in phrases.2 are dense
    # and ascending (true for renumber_suffix_data output) and that
    # phrases.2 is non-empty; an empty file would leave $phindid undef.
    my $nextphindid = $phindid + 1;

    open(TH, "<$thesaurusfile");
    while(<TH>) {

	chomp;
	@fields = split(/:/, $_);

	# read thesaurus-id and ensure it has a corresponding phind-id
	$thesid = shift @fields;
	if (!defined($thesaurustophindid{$thesid})) {
	    $thesaurustophindid{$thesid} = $nextphindid;
	    $phindidtofrequency{$nextphindid} = 0;
	    $nextphindid++;
	}
    }
    close TH;

    # 4.
    # Translate thesaurus file, replacing thesaurus-id numbers with
    # phind-id numbers. Output goes to "$thesaurus.phindid" as
    # "phindid:symbols:linkcount:relations:".
    my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
    my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
    my (@links, $linkid, %linkidtotype, $newrelation);

    open(TH, "<$thesaurusfile");
    open(TO, ">$newthesaurusfile");
    while(<TH>) {

	chomp;
	@fields = split(/:/, $_);

	# phindid and symbols for this line
	($thesid, $symbols, $relations) = @fields;

	die unless ($thesid && $symbols);
	die unless $thesaurustophindid{$thesid};
	$phindid = $thesaurustophindid{$thesid};

	# convert each part of the relation string to use phind-id numbers
	# at the same time, we want to sort the list by frequency.
	undef %linkidtotype;

	# $relations looks like "TYPE,id,id;TYPE,id;..." where TYPE is a
	# link type such as BT or NT
	foreach $linktext (split(/;/, $relations)) {
	    @linkdata = split(/,/, $linktext);

	    # remember the linktype (e.g. BT, NT)
	    $linktype = shift @linkdata;

	    # store the type of each link
	    foreach $thesid (@linkdata) {
		die unless (defined($thesaurustophindid{$thesid}));
		$linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
	    }
	}

	# sort the list of links, first by frequency, then by type.
	@links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
			    or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
	$linkcounter = (scalar @links);

	# create a string describing the link information, grouping
	# consecutive links of the same type under one type label
	#
	# NOTE(review): assumes every thesaurus entry has at least one
	# link; an empty relation list would leave $links[0] undefined.
	$linktype = $linkidtotype{$links[0]};
	$newrelation = $linktype;
	foreach $linkid (@links) {
	    if ($linkidtotype{$linkid} ne $linktype) {
		$linktype = $linkidtotype{$linkid};
		$newrelation .= ";" . $linktype;
	    }
	    $newrelation .= "," . $linkid;
	}
	$newrelation .= ";";


	# output the new line
	print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
    }
    close TH;
    close TO;

    # free the translation tables before the final pass
    undef %thesaurustophindid;
    undef %linkidtotype;
    undef %phindidtofrequency;

    # 5.
    # Read thesaurus data (in phind-id format) into memory
    my %thesaurusdata;

    open(TH, "<$newthesaurusfile");
    while(<TH>) {
	chomp;
	($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
	die unless ($phindid && $symbols);
	$thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
    }
    close TH;

    # 6.
    # Add thesaurus data to phrases file
    #
    # NOTE(review): most of these variables (@documwents, @newexp, $k,
    # $n, $text, $tf, ...) are unused here — they look copied from
    # renumber_suffix_data.
    my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
    my (@documwents, @newexp, $k, $n);
    my $linenumber = 0;

    open(IN, "<$infile");
    open(OUT, ">$outfile");

    # Update existing phrases: append ":linkcount:relations" to any
    # phrase line whose phind-id has thesaurus data, and blank that
    # entry so it is not emitted again below.
    while(<IN>) {

	chomp;
	@fields = split(/:/, $_);

	# get data for this line
	$phindid = shift @fields;

	# output the phrase data, with thesaurus information
	print OUT "$phindid:", join(":", @fields);

	# add thesaurus data
	if (defined($thesaurusdata{$phindid})) {
	    @fields = split(/:/, $thesaurusdata{$phindid});
	    shift @fields;
	    $linkcounter = shift @fields;
	    $relations = shift @fields;

	    print OUT ":$linkcounter:$relations";
	    $thesaurusdata{$phindid} = "";
	}
	print OUT "\n";
    }
    close IN;

    # Add phrases that aren't already in the file: thesaurus-only
    # entries get zero frequency/expansion/document counts.
    foreach $phindid (sort numerically keys %thesaurusdata) {
	next unless ($thesaurusdata{$phindid});

	@fields = split(/:/, $thesaurusdata{$phindid});
	$symbols = shift @fields;
	$linkcounter = shift @fields;
	$relations = shift @fields;

	print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
    }
    close OUT;

}
1409
1410# restore_vocabulary_data
1411#
1412# Read phrases.3 and restore vocabulary information. Then write
1413# this data to the MGPP input files (pword.txt and pdata.txt) and
1414# (if requested) to the saved phrases file.
1415
sub restore_vocabulary_data {
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, 'phrases.3');
    my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
    my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
    my $wordfile = &util::filename_cat($phinddir, 'pword.txt');

    my $savephrases = $self->{'savephrases'};

    # 1.
    # Read the vocabulary file: line N holds the word for symbol
    # number N (symbol numbers start at 1).
    open(V, "<", $vocabfile)
	or die "Cannot open $vocabfile: $!";
    my @symbol;
    my $i = 1;
    while (<V>) {
	chomp;
	$symbol[$i++] = $_;
    }
    close V;

    # 2.
    # Translate phrases.3 to MGPP input files
    my ($key, $text, $word, $isThesaurus, $line);
    my @fields;
    my $linenumber = 0;

    open(IN, "<", $infile)
	or die "Cannot open $infile: $!";
    open(DATA, ">", $datafile)
	or die "Cannot open $datafile for writing: $!";
    open(WORD, ">", $wordfile)
	or die "Cannot open $wordfile for writing: $!";

    # Save the phrases in a separate text file
    if ($savephrases) {
	print $out "Saving phrases in $savephrases\n" if ($verbosity);
	open(SAVE, ">", $savephrases)
	    or die "Cannot open $savephrases for writing: $!";
    }

    while (<IN>) {

	# read the line
	chomp;
	$line = $_;
	@fields = split(/:/, $line);

	# get a phrase number for this line
	$key = shift @fields;

	# restore the text of the phrase by expanding each "s<num>"
	# symbol into its vocabulary word
	$text = shift @fields;
	$text =~ s/s(\d+)/$symbol[$1]/g;

	# a single-word phrase is indexed under that word; multi-word
	# phrases and untranslated thesaurus entries get an empty entry.
	# Bug fix: previously $word was never reset when $text was
	# 'untranslated', so the previous line's word leaked into the
	# word index.
	if ($text =~ / / || $text eq 'untranslated') {
	    $word = "";
	} else {
	    $word = $text;
	}

	# output the phrase data
	print DATA "<Document>";
	print DATA "$key:$text:", join(":", @fields), ":\n";

	# output the word index search data
	print WORD "<Document>$word\n";

	# output the phrases to a text file
	if ($savephrases) {
	    # a thesaurus-derived line carries two extra fields
	    # (link count and relations)
	    if ((scalar @fields) == 7) {
		$isThesaurus = 1;
	    } else {
		$isThesaurus = 0;
	    }
	    print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
	}
    }
    close IN;
    close WORD or die "Cannot close $wordfile: $!";
    close DATA or die "Cannot close $datafile: $!";
    close SAVE if ($savephrases);

}
1501
1502
1503
1504# sort routines used to renumber phrases
1505
# Ascending numeric comparison for sort().
sub numerically { return $a <=> $b; }
1507
# Compare two document-occurrence entries for sort(), ordering by
# descending frequency. Each entry is either "docnum" (frequency
# defaults to 1) or "docnum,frequency".
sub by_doc_frequency {
    my ($fa, $fb) = ($a, $b);
    foreach ($fa, $fb) {
	if (/,/) {
	    s/\d+,//;    # strip the leading "docnum," to leave the frequency
	} else {
	    $_ = 1;      # no comma: frequency defaults to 1
	}
    }
    return ($fb <=> $fa);
}
1522
15231;
Note: See TracBrowser for help on using the repository browser.