source: main/trunk/greenstone2/perllib/classify/Phind.pm@ 21876

Last change on this file since 21876 was 21876, checked in by kjdon, 14 years ago

only process into english clauses if english is the only language, not for eg with ar|en. Don't remove all non \w characters - this removes all non alphanumeric chars. I have made up a punctuation match, some replaced with new lines, some with space.

  • Property svn:keywords set to Author Date Id Revision
File size: 40.8 KB
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32package Phind;
33
34use BaseClassifier;
35use util;
36use ghtml;
37use unicode;
38
39use strict;
40no strict 'refs'; # allow filehandles to be variables and vice versa
41
42my @removedirs = ();
43
44my %wanted_index_files = ('td'=>1,
45 't'=>1,
46 'ti'=>1,
47 'tl'=>1,
48 'tsd'=>1,
49 'idb'=>1,
50 'ib1'=>1,
51 'ib2'=>1,
52 'ib3'=>1,
53 'i'=>1,
54 'il'=>1,
55 'w'=>1,
56 'wa'=>1);
57
58sub BEGIN {
59 @Phind::ISA = ('BaseClassifier');
60}
61
62sub END {
63
64 # Tidy up stray files - we do this here as there's some weird problem
65 # preventing us from doing it in the get_classify_info() function (on
66 # windows at least) where the close() appears to fail on txthandle and
67 # dochandle, thus preventing us from deleting those files
68
69 foreach my $dir (@removedirs) {
70 if (-d $dir && opendir (DIR, $dir)) {
71 my @files = readdir DIR;
72 closedir DIR;
73
74 foreach my $file (@files) {
75 next if $file =~ /^\.\.?$/;
76 my ($suffix) = $file =~ /\.([^\.]+)$/;
77 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
78 # delete it!
79 &util::rm (&util::filename_cat ($dir, $file));
80 }
81 }
82 }
83 }
84}
85
86my $arguments =
87 [ { 'name' => "text",
88 'desc' => "{Phind.text}",
89 'type' => "string",
90 'deft' => "section:Title,section:text",
91 'reqd' => "no" },
92 { 'name' => "title",
93 'desc' => "{Phind.title}",
94 'type' => "metadata",
95 'deft' => "Title",
96 'reqd' => "no" },
97 { 'name' => "buttonname",
98 'desc' => "{BasClas.buttonname}",
99 'type' => "string",
100 'deft' => "Phrase",
101 'reqd' => "no" },
102 { 'name' => "language",
103 'desc' => "{Phind.language}",
104 'type' => "string",
105 'deft' => "en",
106 'reqd' => "no" },
107 { 'name' => "savephrases",
108 'desc' => "{Phind.savephrases}",
109 'type' => "string",
110 'deft' => "",
111 'reqd' => "no" },
112 { 'name' => "suffixmode",
113 'desc' => "{Phind.suffixmode}",
114 'type' => "int",
115 'deft' => "1",
116 'range' => "0,1",
117 'reqd' => "no" },
118 { 'name' => "min_occurs",
119 'desc' => "{Phind.min_occurs}",
120 'type' => "int",
121 'deft' => "2",
122 'range' => "1,",
123 'reqd' => "no" },
124 { 'name' => "thesaurus",
125 'desc' => "{Phind.thesaurus}",
126 'type' => "string",
127 'deft' => "",
128 'reqd' => "no" },
129 { 'name' => "untidy",
130 'desc' => "{Phind.untidy}",
131 'type' => "flag",
132 'reqd' => "no" } ];
133
134my $options = { 'name' => "Phind",
135 'desc' => "{Phind.desc}",
136 'abstract' => "no",
137 'inherits' => "yes",
138 'args' => $arguments };
139
140
141# Phrase delimiter symbols - these should be abstracted out someplace
142
143my $colstart = "COLLECTIONSTART";
144my $colend = "COLLECTIONEND";
145my $doclimit = "DOCUMENTLIMIT";
146my $senlimit = "SENTENCELIMIT";
147my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
148
149
150# Create a new Phind browser based on collect.cfg
151
152sub new {
153 my ($class) = shift (@_);
154 my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
155 push(@$classifierslist, $class);
156
157 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
158 push(@{$hashArgOptLists->{"OptList"}},$options);
159
160 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
161
162 if ($self->{'info_only'}) {
163 # don't worry about any options etc
164 return bless $self, $class;
165 }
166
167 # Ensure the Phind generate scripts are in place
168 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
169 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
170 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
171 if (!(-e $file1)) {
172 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
173 exit(1);
174 }
175
176 # these options may have an "ex." metadata prefix that needs to be stripped off
177 $self->{'text'} = $self->strip_ex_from_metadata($self->{'text'});
178 $self->{'title'} = $self->strip_ex_from_metadata($self->{'title'});
179
180# Transfer the value from the Auto Parsing option to the variable name used in previous Greenstone versions.
181
182 $self->{"indexes"} = $self->{"text"};
183
184 # Further setup
185 $self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
186 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
187 if (! defined $self->{'builddir'}) {
188 $self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
189 }
190 $self->{'total'} = 0;
191
192 # Clean out the unused keys
193 delete $self->{"text"};
194
195 return bless $self, $class;
196}
197
198
199# Initialise the Phind classifier
200
201sub init {
202 my $self = shift (@_);
203
204 # ensure we have a build directory
205 my $builddir = $self->{'builddir'};
206 die unless (-e "$builddir");
207
208 # create Phind directory
209 my $phnumber = 1;
210 my $phinddir = &util::filename_cat($builddir, "phind1");
211 while (-e "$phinddir") {
212 $phnumber++;
213 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
214 }
215 &util::mk_dir("$phinddir");
216 $self->{'phinddir'} = $phinddir;
217 $self->{'phindnumber'} = $phnumber;
218
219 push(@removedirs, $phinddir) unless $self->{'untidy'};
220
221 # open filehandles for documents and text
222 my $clausefile = &util::filename_cat("$phinddir", "clauses");
223 &util::rm($clausefile) if (-e $clausefile);
224
225 my $txthandle = 'TEXT' . $phnumber;
226 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
227 $self->{'txthandle'} = $txthandle;
228
229 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
230 &util::rm($docfile) if (-e $docfile);
231
232 my $dochandle = 'DOC' . $phnumber;
233 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
234 $self->{'dochandle'} = $dochandle;
235
236}
237
238
239# Classify each document.
240#
241# Each document is passed here in turn. The classifier extracts the
242# text of each and stores it in the clauses file. Document details are
243# stored in the docs.txt file.
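# Each docs.txt record is one tab-separated line of the form
# "<Document>\tOID\ttitle", while the clauses file receives a
# DOCUMENTLIMIT marker followed by the document's tokenised text,
# one clause per line (see the code below).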
244
245sub classify {
246 my $self = shift (@_);
247 my ($doc_obj,$edit_mode) = @_;
248
249 my $verbosity = $self->{'verbosity'};
250 my $top_section = $doc_obj->get_top_section();
251
252 my $titlefield = $self->{'title'};
253
254 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
255 if (!defined($title)) {
256 $title = "";
257 print STDERR "Phind: document has no title\n";
258 }
259 print "process: $title\n" if ($verbosity > 2);
260
261 # Only consider the file if it is in the correct language
262 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
263 my $phrlanguage = $self->{'language'};
264 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
265
266 if ($edit_mode eq "delete") {
267 # This classifier works quite differently to the others
268 # Probably doesn't support incremental building anyway
269 return;
270 }
271
272 # record this file
273 $self->{'total'} ++;
274 # what is $file ???
275 # print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
276
277 # Store document details
278 my $OID = $doc_obj->get_OID();
279 $OID = "NULL" unless defined $OID;
280 my $dochandle = $self->{'dochandle'};
281 print $dochandle "<Document>\t$OID\t$title\n";
282
283 # Store the text occurring in this object
284
285 # output the document delimiter
286 my $txthandle = $self->{'txthandle'};
287 print $txthandle "$doclimit\n";
288
289 # iterate over the required indexes and store their text
290 my $indexes = $self->{'indexes'};
291 my $text = "";
292 my ($part, $level, $field, $section, $data, $dataref);
293
294 foreach $part (split(/,/, $indexes)) {
295
296 # Each field has a level and a data element (e.g. document:Title)
297 ($level, $field) = split(/:/, $part);
298 die unless ($level && $field);
299
300 # Extract the text from every section
301 # (In phind, document:text and section:text are equivalent)
302 if ($field eq "text") {
303 $data = "";
304 $section = $doc_obj->get_top_section();
305 while (defined($section)) {
306 $data .= $doc_obj->get_text($section) . "\n";
307 $section = $doc_obj->get_next_section($section);
308 }
309 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
310 }
311
312 # Extract a metadata field from a document
313 # (If there is more than one element of the given type, get them all.)
314 elsif ($level eq "document") {
315 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
316 foreach $data (@$dataref) {
317 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
318 }
319 }
320
321 # Extract metadata from every section in a document
322 elsif ($level eq "section") {
323 $data = "";
324 $section = $doc_obj->get_top_section();
325 while (defined($section)) {
326 $dataref = $doc_obj->get_metadata($section, $field);
327 $data .= join("\n", @$dataref) . "\n";
328 $section = $doc_obj->get_next_section($section);
329 }
330 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
331 }
332
333 # Some sort of specification which I don't understand
334 else {
335 die "Unknown level ($level) in Phind index ($part)\n";
336 }
337
338 }
339
340 # output the text
341 $text =~ tr/\n//s;
342 print $txthandle "$text";
343}
344
345
346# Construct the classifier from the information already gathered
347#
348# When get_classify_info is called, the clauses and docs.txt files have
349# already been constructed in the Phind directory. This function will
350# translate them into compressed, indexed MGPP files that can be read by
351# the phindcgi script. It will also register our classifier so that it
352# shows up in the navigation bar.
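#
# In outline, the files involved are (roughly, as used by the code below):
#
#   clauses, docs.txt                  - written during classify()
#   clauses.vocab, .stats, .numbers    - written by extract_vocabulary()
#   phrases                            - written by the external "suffix" program
#   phrases.2, phrases.3               - written by renumber_phrases()
#   pdata.txt, pword.txt               - written by restore_vocabulary_data()
#   pdata.*, pword.*, docs.* indexes   - built by the mgpp_* tools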
353
354sub get_classify_info {
355 my $self = shift (@_);
356 my ($gli) = @_;
357
358 close $self->{'dochandle'};
359 close $self->{'txthandle'};
360 my $verbosity = $self->{'verbosity'};
361 my $out = $self->{'outhandle'};
362 my $phinddir = $self->{'phinddir'};
363
364 my $osextra = "";
365 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
366 $osextra = " -d /";
367 }
368
369 print STDERR "</Stage>\n" if $gli;
370
371 if ($verbosity) {
372 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
373 print $out "*** in ", $self->{'phinddir'}, "\n";
374 }
375
376 print STDERR "<Stage name='Phind'>\n" if $gli;
377
378 # Construct phind indexes
379 my $suffixmode = $self->{'suffixmode'};
380 my $min_occurs = $self->{'min_occurs'};
381 my ($command, $status);
382
383 # Generate the vocabulary, symbol statistics, and numbers file
384 # from the clauses file
385 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
386 print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
387 &extract_vocabulary($self);
388
389 # Use the suffix program to generate the phind/phrases file
390 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
391 print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
392 &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
393
394 # check that we generated some files. It's not necessarily an error if
395 # we didn't (execute() would have quit on error), but we can't go on.
396 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
397 if (! -r $phrasesfile) {
398 print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
399 print $out "\nNo phrases found for Phind classifier!\n";
400 return;
401 }
402
403 # Create the phrase file and put phrase numbers in phind/phrases
404 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
405 print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
406 &renumber_phrases($self);
407
408 print $out "\nCreating phrase databases\n";
409 print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
410 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
411 my $mg_stem = &util::filename_cat($phinddir, "pdata");
412
413 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
414 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
415 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
416
417 # create the mg index of words
418 print $out "\nCreating word-level search indexes\n";
419 print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
420 $mg_input = &util::filename_cat($phinddir, "pword.txt");
421 $mg_stem = &util::filename_cat($phinddir, "pword");
422
423 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
424 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
425 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
426 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
427 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
428 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
429
430 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
431 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
432 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
433
434 # create the mg document information database
435 print $out "\nCreating document information databases\n";
436 print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
437 $mg_input = &util::filename_cat($phinddir, "docs.txt");
438 $mg_stem = &util::filename_cat($phinddir, "docs");
439
440 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
441 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
442 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
443
444 # Return the information about the classifier that we'll later want to
445 # use to create macros when the Phind classifier document is displayed.
446 my %classifyinfo = ('thistype'=>'Invisible',
447 'childtype'=>'Phind',
448 'Title'=>$self->{'buttonname'},
449 'parameters'=>"phindnumber=$self->{'phindnumber'}",
450 'contains'=>[]);
451
452 my $collection = $self->{'collection'};
453 my $url = "library?a=p&p=phind&c=$collection";
454 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
455
456 return \%classifyinfo;
457}
458
459
460
461sub convert_gml_to_tokens {
462
463 my ($language_exp, $text) = @_;
464
465 # escape any magic words... - jrm21
466 foreach my $delim (@delimiters) {
467 my $replacement=lc($delim);
468 my $num= $text=~ s/$delim/$replacement/g;
469 if (!$num) {$num=0;}
470 }
471
472 if ($language_exp =~ /^en$/) {
473 return &convert_gml_to_tokens_EN($text);
474 }
475
476 if ($language_exp =~ /zh/) {
477 return &convert_gml_to_tokens_ZH($text);
478 }
479
480 $_ = $text;
481
482 # 1. remove GML tags
483
484 # Remove everything that is in a tag
485 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
486 s/\s*<br>\s*/ LINEBREAK /isgo;
487 s/<[^>]*>/ /sgo;
488
489 # Now we have the text, but it may still contain HTML elements
490 # coded as entities (&gt; etc.). Decode them, then strip any tags this reveals.
491 s/&amp;/&/sgo;
492 s/&lt;/</sgo;
493 s/&gt;/>/sgo;
494 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
495 s/\s*<br>\s*/ LINEBREAK /isgo;
496 s/<[^>]*>/ /sgo;
497
498 # replace the <p> and <br> placeholders with the clause break symbol (\n)
499 s/\s+/ /gso;
500 s/PARAGRAPHBREAK/\n/sgo;
501 s/LINEBREAK/\n/sgo;
502
503
504 # 2. Split the remaining text into space-delimited tokens
505
506 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
507 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
508
509 # Split text at word boundaries
510 s/\b/ /go;
511
512 # 3. Convert the remaining text to "clause format"
513
514 # Insert newline if the end of a sentence is detected
515 # (delimiter is: "[\.\?\!]\s")
516 # s/\s*[\.\?\!]\s+/\n/go;
517
518 # remove unnecessary punctuation and replace with clause break symbol (\n)
519 # the following very nicely removes all non alphanumeric characters. too bad if you are not using english...
520 #s/[^\w ]/\n/go;
521 # replace punct with new lines - is this what we want??
522 s/\s*[\?\;\:\!\,\.\"\[\]\{\}\(\)]\s*/\n/go; #"
523 # then remove other punct with space
524 s/[\'\`\\\_]/ /go;
525
526 # remove extraneous whitespace
527 s/ +/ /sgo;
528 s/^\s+//mgo;
529 s/\s*$/\n/mgo;
530
531 # remove lines that contain one word or less
532 s/^\S*$//mgo;
533 s/^\s*$//mgo;
534 tr/\n//s;
535
536 return $_;
537}
538
539# a Chinese version of convert_gml_to_tokens
540sub convert_gml_to_tokens_ZH {
541
542 $_ = shift @_;
543
544 # Replace all whitespace with a simple space
545 s/\s+/ /gs;
546 # Remove everything that is in a tag
547 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
548 s/\s*<br>\s*/ LINEBREAK /isg;
549 s/<[^>]*>/ /sg;
550
551 # Now we have the text, but it may still contain HTML elements
552 # coded as entities (&gt; etc.). Decode them, then strip any tags this reveals.
553 s/&lt;/</sg;
554 s/&gt;/>/sg;
555
556 s/\s+/ /sg;
557 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
558 s/\s*<br>\s*/ LINEBREAK /isg;
559 s/<[^>]*>/ /sg;
560
561 # remove &amp; and other miscellaneous markup tags
562 s/&amp;/&/sg;
563 s/&lt;/</sg;
564 s/&gt;/>/sg;
565 s/&amp;/&/sg;
566
567 # replace the <p> and <br> placeholders with newlines
568 s/PARAGRAPHBREAK/\n/sg;
569 s/LINEBREAK/\n/sg;
570
571
572# print STDERR "text:$_\n";
573 return $_;
574}
575
576# A version of convert_gml_to_tokens that is fine-tuned to the English language.
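#
# As a rough illustration (the sample text is made up), input like
#
#   <p>The cat sat on the mat. It was happy!</p>
#
# should come out as one clause per line:
#
#   The cat sat on the mat
#   It was happy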
577
578sub convert_gml_to_tokens_EN {
579 $_ = shift @_;
580
581 # FIRST, remove GML tags
582
583 # Replace all whitespace with a simple space
584 s/\s+/ /gs;
585
586 # Remove everything that is in a tag
587 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
588 s/\s*<br>\s*/ LINEBREAK /isg;
589 s/<[^>]*>/ /sg;
590
591 # Now we have the text, but it may still contain HTML elements
592 # coded as entities (&gt; etc.). Decode them, then strip any tags this reveals.
593 s/&lt;/</sg;
594 s/&gt;/>/sg;
595
596 s/\s+/ /sg;
597 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
598 s/\s*<br>\s*/ LINEBREAK /isg;
599 s/<[^>]*>/ /sg;
600
601 # remove &amp; and other miscellaneous markup tags
602 s/&amp;/&/sg;
603 s/&lt;/</sg;
604 s/&gt;/>/sg;
605 s/&amp;/&/sg;
606
607 # replace the <p> and <br> placeholders with newlines
608 s/PARAGRAPHBREAK/\n/sg;
609 s/LINEBREAK/\n/sg;
610
611
612 # Exceptional punctuation
613 #
614 # We make special cases of some punctuation
615
616 # remove any apostrophe that indicates omitted letters
617 s/(\w+)\'(\w*\s)/ $1$2 /g;
618
619 # remove a period that appears in a person's initials
620 s/\s([A-Z])\./ $1 /g;
621
622 # replace hyphens in hyphenated words and names with a space
623 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
624
625 # Convert the remaining text to "clause format".
626 # This means removing all excess punctuation and garbage text,
627 # normalising valid punctuation to fullstops and commas,
628 # then putting one clause on each line.
629
630 # Insert newline when the end of a sentence is detected
631 # (delimiter is: "[\.\?\!]\s")
632 s/\s*[\.\?\!]\s+/\n/g;
633
634 # split numbers after four digits
635 s/(\d\d\d\d)/$1 /g;
636
637 # split words after 32 characters
638
639 # squash repeated punctuation
640 tr/A-Za-z0-9 //cs;
641
642 # save email addresses
643 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
644
645 # normalise clause breaks (mostly punctuation symbols) to commas
646 s/[^A-Za-z0-9 \n]+/ , /g;
647
648 # Remove repeated commas, and replace with newline
649 s/\s*,[, ]+/\n/g;
650
651 # remove extra whitespace
652 s/ +/ /sg;
653 s/^\s+//mg;
654 s/\s*$/\n/mg;
655
656 # remove lines that contain one word or less
657 s/^\w*$//mg;
658 s/^\s*$//mg;
659 tr/\n//s;
660
661 return $_;
662
663}
664
665
666
667# Execute a system command
668
669sub execute {
670 my ($command, $verbosity, $outhandle) = @_;
671 print $outhandle "Executing: $command\n" if ($verbosity > 2);
672 $! = 0;
673 my $status = system($command);
674 if ($status != 0) {
675 print STDERR "Phind - Error executing '$command': $!\n";
676 exit($status); # this causes the build to fail...
677 }
678}
679
680
681# Generate the vocabulary, symbol statistics, and numbers file from the
682# clauses file. This is legacy code, so it is a bit messy and probably won't
683# run under Windows.
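#
# The files written here (all in the phind build directory) are roughly:
#   clauses.vocab   - one token per line; line i holds the "best" form of symbol i
#   clauses.stats   - symbol-number ranges, e.g. "first_stopword <n>"
#   clauses.numbers - the clauses text again, one symbol number per line
# plus <thesaurus>.numbers when a thesaurus is configured.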
684
685sub extract_vocabulary {
686 my ($self) = @_;
687
688 my $verbosity = $self->{'verbosity'};
689 my $out = $self->{'outhandle'};
690
691 my $collectiondir = $self->{'collectiondir'};
692 my $phinddir = $self->{'phinddir'};
693
694 my $language_exp = $self->{'language'};
695
696 my ($w, $l, $line, $word);
697
698 my ($first_delimiter, $last_delimiter,
699 $first_stopword, $last_stopword,
700 $first_extractword, $last_extractword,
701 $first_contentword, $last_contentword,
702 $phrasedelimiter);
703
704 my $thesaurus = $self->{'thesaurus'};
705 my ($thesaurus_links, $thesaurus_terms,
706 %thesaurus, $first_thesaurusword, $last_thesaurusword);
707
708 my %symbol;
709 my (%freq);
710
711 print $out "Calculating vocabulary\n" if ($verbosity > 1);
712
713 # Read and store the stopwords
714 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
715 my $stopword_files = ();
716 my ($language, $language_dir, $file, $file_name);
717 my %stopwords;
718
719 # Examine each directory in the stopword directory
720 opendir(STOPDIR, $stopdir);
721 foreach $language (readdir STOPDIR) {
722
723 # Ignore entries that do not match the classifier's language
724 next unless ($language =~ /$language_exp/);
725 $language_dir = &util::filename_cat($stopdir, $language);
726 next unless (-d "$language_dir");
727
728 opendir(LANGDIR, $language_dir);
729 foreach $file (readdir LANGDIR) {
730
731 # Ignore entries that are not stopword files
732 next unless ($file =~ /sw$/);
733 $file_name = &util::filename_cat($language_dir, $file);
734 next unless (-f "$file_name");
735
736 # Read the stopwords
737 open(STOPFILE, "<$file_name");
738 while (<STOPFILE>) {
739 s/^\s+//;
740 s/\s.*//;
741 $word = $_;
742 $l = lc($word);
743 $stopwords{$l} = $word;
744 }
745 close STOPFILE;
746
747 }
748 closedir LANGDIR;
749 }
750 closedir STOPDIR;
751
752 # Read thesaurus information
753 if ($thesaurus) {
754
755 # link file exists
756 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
757 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
758
759 # ensure term file exists in the correct language
760 if ($language_exp =~ /^([a-z][a-z])/) {
761 $language = $1;
762 } else {
763 $language = 'en';
764 }
765 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
766 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
767
768
769 # Read the thesaurus terms
770 open(TH, "<$thesaurus_terms");
771 while(<TH>) {
772 s/^\d+ //;
773 s/\(.*\)//;
774 foreach $w (split(/\s+/, $_)) {
775 $thesaurus{lc($w)} = $w;
776 }
777 }
778 close TH;
779 }
780
781 # Read the words in the text and count occurrences
782 open(TXT, "<$phinddir/clauses");
783
784 my @words;
785 while(<TXT>) {
786 $line = $_;
787 next unless ($line =~ /./);
788
789 @words = split(/\s+/, $line);
790 foreach $w (@words) {
791 $l = lc($w);
792 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
793 $freq{$w}++;
794 }
795 $freq{$senlimit}++;
796 }
797
798 close TXT;
799
800 # Calculate the "best" form of each word
801 my (%bestform, %totalfreq, %bestfreq);
802
803 foreach $w (sort (keys %freq)) {
804 $l = lc($w);
805
806 # totalfreq is the number of times a term appears in any form
807 $totalfreq{$l} += $freq{$w};
808
809 if (defined $stopwords{$l}) {
810 $bestform{$l} = $stopwords{$l};
811
812 } elsif (defined $thesaurus{$l}) {
813 $bestform{$l} = $thesaurus{$l};
814
815 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
816 $bestfreq{$l} = $freq{$w};
817 $bestform{$l} = $w;
818 }
819 }
820 undef %freq;
821 undef %bestfreq;
822
823
824 # Assign symbol numbers to tokens
825 my $nextsymbol = 1;
826 my (@vocab);
827
828 # Delimiters
829 $first_delimiter = 1;
830
831 foreach $word (@delimiters) {
832
833# $word = lc($word); # jrm21
834 $word = uc($word);
835 $bestform{$word} = $word;
836 $vocab[$nextsymbol] = $word;
837 $symbol{$word} = $nextsymbol;
838 $nextsymbol++;
839 }
840 $last_delimiter = $nextsymbol - 1;
841 # Stopwords
842 $first_stopword = $nextsymbol;
843
844 foreach my $word (sort keys %stopwords) {
845 # don't include stopword unless it occurs in the text
846 $word = lc($word);
847 next unless ($totalfreq{$word});
848 next if ($symbol{$word});
849
850 $vocab[$nextsymbol] = $word;
851 $symbol{$word} = $nextsymbol;
852 $nextsymbol++;
853 }
854 $last_stopword = $nextsymbol - 1;
855 $first_contentword = $nextsymbol;
856
857 # Thesaurus terms
858 if ($thesaurus) {
859 $first_thesaurusword = $nextsymbol;
860
861 foreach my $word (sort keys %thesaurus) {
862
863 $word = lc($word);
864 next if ($symbol{$word});
865 $bestform{$word} = $thesaurus{$word};
866
867 $vocab[$nextsymbol] = $word;
868 $symbol{$word} = $nextsymbol;
869 $nextsymbol++;
870
871 }
872 $last_thesaurusword = $nextsymbol - 1;
873 }
874
875 # Other content words
876 $first_extractword = $nextsymbol;
877
878 foreach my $word (sort (keys %bestform)) {
879
880 next if ($symbol{$word});
881
882 $vocab[$nextsymbol] = $word;
883 $symbol{$word} = $nextsymbol;
884 $nextsymbol++;
885 }
886 $last_extractword = $nextsymbol - 1;
887 $last_contentword = $nextsymbol - 1;
888
889 # Output the words
890 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
891 open(VOC, ">$phinddir/clauses.vocab");
892
893 for (my $i = 1; $i < $nextsymbol; $i++) {
894 $w = $vocab[$i];
895
896 print VOC "$bestform{$w}\n";
897 $totalfreq{$w} = 0 unless ($totalfreq{$w});
898 }
899 close VOC;
900
901
902 # Create statistics file
903 # Output statistics about the vocabulary
904 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
905 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
906
907 open(STAT, ">$phinddir/clauses.stats")
908 || die "Cannot open $phinddir/clauses.stats: $!";
909
910 print STAT "first_delimiter $first_delimiter\n";
911 print STAT "last_delimiter $last_delimiter\n";
912 print STAT "first_stopword $first_stopword\n";
913 print STAT "last_stopword $last_stopword\n";
914 if ($thesaurus) {
915 print STAT "first_thesaurusword $first_thesaurusword\n";
916 print STAT "last_thesaurusword $last_thesaurusword\n";
917 }
918 print STAT "first_extractword $first_extractword\n";
919 print STAT "last_extractword $last_extractword\n";
920 print STAT "first_contentword $first_contentword\n";
921 print STAT "last_contentword $last_contentword\n";
922 print STAT "first_symbol $first_delimiter\n";
923 print STAT "last_symbol $last_contentword\n";
924 print STAT "first_word $first_stopword\n";
925 print STAT "last_word $last_contentword\n";
926 close STAT;
927
928 undef @vocab;
929
930
931 # Create numbers file
932 # Save text as symbol numbers
933 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
934
935 open(TXT, "<$phinddir/clauses");
936 open(NUM, ">$phinddir/clauses.numbers");
937
938## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
939## print NUM "$symbol{lc($colstart)}\n"; # jrm21
940 $phrasedelimiter = $symbol{$senlimit};
941 print NUM "$symbol{$colstart}\n";
942
943 # translate each clause into symbol numbers (the special symbols that delimit documents and sentences were set up above)
944 while(<TXT>) {
945
946 # split sentence into a list of tokens
947 $line = $_;
948 next unless ($line =~ /./);
949 @words = split(/\s+/, $line);
950
951 # output one token at a time
952 foreach $word (@words) {
953# don't lower-case special delimiters - jrm21
954 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
955 $word = lc($word);
956 }
957 print NUM "$symbol{$word}\n";
958 }
959
960 # output phrase delimiter
961 print NUM "$phrasedelimiter\n";
962 }
963
964 close TXT;
965# print NUM "$symbol{lc($colend)}\n";# jrm21
966 print NUM "$symbol{$colend}\n";
967 close NUM;
968
969 # Save thesaurus data in one convenient file
970 if ($thesaurus) {
971
972 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
973
974
975 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
976 if ($verbosity > 1);
977
978 # Read the thesaurus terms
979 my ($num, $text, %thes_symbols);
980
981 open(TH, "<$thesaurus_terms");
982 while(<TH>) {
983 chomp;
984 @words = split(/\s+/, $_);
985 $num = shift @words;
986 $text = "";
987
988 # translate words into symbol numbers
989 foreach $word (@words) {
990 $word = lc($word);
991 if ($symbol{$word}) {
992 $text .= "s$symbol{$word} ";
993 } elsif ($verbosity) {
994 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
995 }
996 }
997 $text =~ s/ $//;
998 $thes_symbols{$num} = $text;
999 }
1000 close TH;
1001
1002 # Read the thesaurus links and write the corresponding data
1003 open(TH, "<$thesaurus_links");
1004 open(THOUT, ">$thesaurusfile");
1005
1006 while(<TH>) {
1007 chomp;
1008 ($num, $text) = split(/:/, $_);
1009
1010 if (defined($thes_symbols{$num})) {
1011 print THOUT "$num:$thes_symbols{$num}:$text\n";
1012 } else {
1013 print THOUT "$num:untranslated:$text\n";
1014 }
1015 }
1016 close TH;
1017 close THOUT;
1018 }
1019
1020
1021
1022
1023}
1024
1025
1026# renumber_phrases
1027#
1028# Prepare the phrases file to be input to mgpp. The biggest problem is
1029# reconciling the phrase identifiers used by the suffix program (which
1030# we'll call suffix-id numbers) with the numbers used in the thesaurus
1031# (thesaurus-id) to create a common set of phind-id numbers (phind-id).
1032# Phind-id numbers must be sorted by frequency of occurrence.
1033#
1034# Start creating a set of phind-id numbers from the sorted suffix-id
1035# numbers and (if required) the thesaurus-id numbers. Then add any other
1036# phrases occurring in the thesaurus.
1037#
1038# The last thing we have to do is restore the vocabulary information to the
1039# phrase file so that the phrases are stored as words, not as symbol
1040# numbers.
1041
1042# The original phrases file looks something like this:
1043# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1044# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1045# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1046# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1047
1048
1049sub renumber_phrases {
1050 my ($self) = @_;
1051
1052 renumber_suffix_data($self);
1053 renumber_thesaurus_data($self);
1054 restore_vocabulary_data($self);
1055
1056}
1057
1058
1059
1060# renumber_suffix_data
1061#
1062# Translate phrases file to phrases.2 using phind keys instead
1063# of suffix keys and sorting the expansion data.
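#
# Each phrases.2 line, as written below, has the form
#
#   phind-id : symbols : total-frequency : expansion-count : document-count :
#       expansions (phind-ids, comma-terminated) : documents (semicolon-terminated)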
1064
1065sub renumber_suffix_data {
1066 my ($self) = @_;
1067
1068 my $verbosity = $self->{'verbosity'};
1069 my $out = $self->{'outhandle'};
1070 print $out "Translate phrases: suffix-ids become phind-id's\n"
1071 if ($verbosity);
1072
1073 my $phinddir = $self->{'phinddir'};
1074 my $infile = &util::filename_cat($phinddir, 'phrases');
1075 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1076
1077 # Read the phrase file. Calculate initial set of phind-id
1078 # numbers and store (suffixid -> frequency) relation.
1079
1080 my %suffixtophind;
1081 my @totalfrequency;
1082 my (@fields, $suffixid);
1083 my $nextphind = 1;
1084
1085 open(IN, "<$infile");
1086 while(<IN>) {
1087
1088 chomp;
1089 @fields = split(/:/, $_);
1090
1091 # get next suffixid and phindid
1092 $suffixid = shift @fields;
1093 $suffixtophind{$suffixid} = $nextphind;
1094
1095 # store total frequency
1096 shift @fields;
1097 $totalfrequency[$nextphind] = shift @fields;
1098
1099 $nextphind++;
1100 }
1101 close IN;
1102
1103
1104 # Translate phrases file to phrases.2. Use phind keys (not suffix
1105 # keys), sort expansion and document occurrence data in order of
1106 # descending frequency.
1107 open(IN, "<$infile");
1108 open(OUT, ">$outfile");
1109
1110 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1111 my (@documwents, @newexp, $k, $n);
1112 my $linenumber = 0;
1113
1114 while(<IN>) {
1115
1116 # read the line
1117 chomp;
1118 @fields = split(/:/, $_);
1119
1120 # get a phrase number for this line
1121 $suffixid = shift @fields;
1122 die unless (defined($suffixtophind{$suffixid}));
1123 $phindid = $suffixtophind{$suffixid};
1124
1125 # get the symbols in the phrase
1126 $text = shift @fields;
1127
1128 # output status information
1129 $linenumber++;
1130 if ($verbosity > 2) {
1131 if ($linenumber % 1000 == 0) {
1132 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1133 }
1134 # what are $num and $key??
1135 #print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1136 }
1137
1138 # get the phrase frequency
1139 $tf = shift @fields;
1140
1141 # get the number of expansions
1142 $countexp = shift @fields;
1143
1144 # get the expansions, convert them into phind-id numbers, and sort them
1145 $expansions = shift @fields;
1146 @newexp = ();
1147 foreach $k (split(/,/, $expansions)) {
1148 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1149 $n = $suffixtophind{$k};
1150 push @newexp, $n;
1151 }
1152 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1153
1154 # get the number of documents
1155 $countdocs = shift @fields;
1156
1157 # get the documents and sort them
1158 $documents = shift @fields;
1159 $documents =~ s/d//g;
1160 my @documents = split(/;/, $documents);
1161 @documents = sort by_doc_frequency @documents;
1162
1163 # output the phrase data
1164 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1165 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1166
1167 }
1168
1169 close IN;
1170 close OUT;
1171}
1172
1173
1174# renumber_thesaurus_data
1175#
1176# Translate phrases.2 to phrases.3, adding thesaurus data if available.
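#
# Each line of <thesaurus>.numbers (written by extract_vocabulary) looks
# roughly like
#
#   thesaurus-id : s<symbol> s<symbol> ... : TYPE,id,id;TYPE,id;...
#
# where TYPE is a thesaurus link type (e.g. BT, NT) and the ids are
# thesaurus-ids.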
1177
1178sub renumber_thesaurus_data {
1179 my ($self) = @_;
1180
1181 my $out = $self->{'outhandle'};
1182 my $verbosity = $self->{'verbosity'};
1183 my $thesaurus = $self->{'thesaurus'};
1184
1185 my $phinddir = $self->{'phinddir'};
1186 my $infile = &util::filename_cat($phinddir, "phrases.2");
1187 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1188
1189
1190 # If no thesaurus is defined, simply move the phrases file.
1191 if (!$thesaurus) {
1192 print $out "Translate phrases.2: no thesaurus data\n"
1193 if ($verbosity);
1194 &util::mv($infile, $outfile);
1195 return;
1196 }
1197
1198 print $out "Translate phrases.2: add thesaurus data\n"
1199 if ($verbosity);
1200
1201 # 1.
1202 # Read thesaurus file and store (symbols->thesaurusid) mapping
1203 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1204 my %symbolstothesid;
1205 my (@fields, $thesid, $symbols);
1206
1207 open(TH, "<$thesaurusfile");
1208
1209 while (<TH>) {
1210
1211 chomp;
1212 @fields = split(/:/, $_);
1213
1214 # get id and text
1215 $thesid = shift @fields;
1216 $symbols = shift @fields;
1217 $symbolstothesid{$symbols} = $thesid;
1218 }
1219 close TH;
1220
1221 # 2.
1222 # Read phrases file to find thesaurus entries that already
1223 # have a phindid. Store their phind-ids for later translation,
1224 # and store their frequency for later sorting.
1225 my %thesaurustophindid;
1226 my %phindidtofrequency;
1227 my ($phindid, $freq);
1228
1229 open(IN, "<$infile");
1230
1231 while(<IN>) {
1232
1233 chomp;
1234 @fields = split(/:/, $_);
1235
1236 # phindid and symbols for this line
1237 $phindid = shift @fields;
1238 $symbols = shift @fields;
1239 $freq = shift @fields;
1240
1241 # do we have a thesaurus id corresponding to this phrase?
1242 if (defined($symbolstothesid{$symbols})) {
1243 $thesid = $symbolstothesid{$symbols};
1244 $thesaurustophindid{$thesid} = $phindid;
1245 $phindidtofrequency{$phindid} = $freq;
1246 }
1247 }
1248 close IN;
1249
1250 undef %symbolstothesid;
1251
1252 # 3.
1253 # Create phind-id numbers for remaining thesaurus entries,
1254 # and note that their frequency is 0 for later sorting.
1255 my $nextphindid = $phindid + 1;
1256
1257 open(TH, "<$thesaurusfile");
1258 while(<TH>) {
1259
1260 chomp;
1261 @fields = split(/:/, $_);
1262
1263 # read thesaurus-id and ensure it has a corresponding phind-id
1264 $thesid = shift @fields;
1265 if (!defined($thesaurustophindid{$thesid})) {
1266 $thesaurustophindid{$thesid} = $nextphindid;
1267 $phindidtofrequency{$nextphindid} = 0;
1268 $nextphindid++;
1269 }
1270 }
1271 close TH;
1272
1273 # 4.
1274 # Translate thesaurus file, replacing thesaurus-id numbers with
1275 # phind-id numbers.
1276 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1277 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1278 my (@links, $linkid, %linkidtotype, $newrelation);
1279
1280 open(TH, "<$thesaurusfile");
1281 open(TO, ">$newthesaurusfile");
1282 while(<TH>) {
1283
1284 chomp;
1285 @fields = split(/:/, $_);
1286
1287 # phindid and symbols for this line
1288 ($thesid, $symbols, $relations) = @fields;
1289
1290 die unless ($thesid && $symbols);
1291 die unless $thesaurustophindid{$thesid};
1292 $phindid = $thesaurustophindid{$thesid};
1293
1294 # convert each part of the relation string to use phind-id numbers
1295 # at the same time, we want to sort the list by frequency.
1296 undef %linkidtotype;
1297
1298 foreach $linktext (split(/;/, $relations)) {
1299 @linkdata = split(/,/, $linktext);
1300
1301 # remember the linktype (e.g. BT, NT)
1302 $linktype = shift @linkdata;
1303
1304 # store the type of each link
1305 foreach $thesid (@linkdata) {
1306 die unless (defined($thesaurustophindid{$thesid}));
1307 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1308 }
1309 }
1310
1311 # sort the list of links, first by frequency, then by type.
1312 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1313 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1314 $linkcounter = (scalar @links);
1315
1316 # create a string describing the link information
1317 $linktype = $linkidtotype{$links[0]};
1318 $newrelation = $linktype;
1319 foreach $linkid (@links) {
1320 if ($linkidtotype{$linkid} ne $linktype) {
1321 $linktype = $linkidtotype{$linkid};
1322 $newrelation .= ";" . $linktype;
1323 }
1324 $newrelation .= "," . $linkid;
1325 }
1326 $newrelation .= ";";
1327
1328
1329 # output the new line
1330 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1331 }
1332 close TH;
1333 close TO;
1334
1335 undef %thesaurustophindid;
1336 undef %linkidtotype;
1337 undef %phindidtofrequency;
1338
1339 # 5.
1340 # Read thesaurus data (in phind-id format) into memory
1341 my %thesaurusdata;
1342
1343 open(TH, "<$newthesaurusfile");
1344 while(<TH>) {
1345 chomp;
1346 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1347 die unless ($phindid && $symbols);
1348 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1349 }
1350 close TH;
1351
1352 # 6.
1353 # Add thesaurus data to phrases file
1354 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1355 my (@documwents, @newexp, $k, $n);
1356 my $linenumber = 0;
1357
1358 open(IN, "<$infile");
1359 open(OUT, ">$outfile");
1360
1361 # Update existing phrases
1362 while(<IN>) {
1363
1364 chomp;
1365 @fields = split(/:/, $_);
1366
1367 # get data for this line
1368 $phindid = shift @fields;
1369
1370 # output the phrase data, with thesaurus information
1371 print OUT "$phindid:", join(":", @fields);
1372
1373 # add thesaurus data
1374 if (defined($thesaurusdata{$phindid})) {
1375 @fields = split(/:/, $thesaurusdata{$phindid});
1376 shift @fields;
1377 $linkcounter = shift @fields;
1378 $relations = shift @fields;
1379
1380 print OUT ":$linkcounter:$relations";
1381 $thesaurusdata{$phindid} = "";
1382 }
1383 print OUT "\n";
1384 }
1385 close IN;
1386
1387 # Add phrases that aren't already in the file
1388 foreach $phindid (sort numerically keys %thesaurusdata) {
1389 next unless ($thesaurusdata{$phindid});
1390
1391 @fields = split(/:/, $thesaurusdata{$phindid});
1392 $symbols = shift @fields;
1393 $linkcounter = shift @fields;
1394 $relations = shift @fields;
1395
1396 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1397 }
1398 close OUT;
1399
1400}
1401
1402# restore_vocabulary_data
1403#
1404# Read phrases.3 and restore vocabulary information. Then write
1405# this data to the MGPP input files (pword.txt and pdata.txt) and
1406# (if requested) to the saved phrases file.
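#
# Each phrase becomes one "<Document>" record in both output files:
# pdata.txt holds the full colon-separated phrase record with symbol
# numbers expanded back into words, while pword.txt holds just the phrase
# text when the phrase is a single word (and an empty entry otherwise).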
1407
1408sub restore_vocabulary_data {
1409 my ($self) = @_;
1410
1411 my $out = $self->{'outhandle'};
1412 my $verbosity = $self->{'verbosity'};
1413 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1414
1415 my $phinddir = $self->{'phinddir'};
1416 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1417 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1418 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1419 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1420
1421 my $savephrases = $self->{'savephrases'};
1422
1423 # 1.
1424 # Read the vocabulary file
1425 open(V, "<$vocabfile")
1426 || die "Cannot open $vocabfile: $!";
1427 my @symbol;
1428 my $i = 1;
1429 while(<V>) {
1430 chomp;
1431 $symbol[$i++] = $_;
1432 }
1433 close V;
1434
1435 # 2.
1436 # Translate phrases.3 to MGPP input files
1437 my ($key, $text, $word, $isThesaurus, $line);
1438 my @fields;
1439 my $linenumber = 0;
1440
1441 open(IN, "<$infile");
1442 open(DATA, ">$datafile");
1443 open(WORD, ">$wordfile");
1444
1445 # Save the phrases in a separate text file
1446 if ($savephrases) {
1447 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1448 open(SAVE, ">$savephrases");
1449 }
1450
1451 while(<IN>) {
1452
1453 # read the line
1454 chomp;
1455 $line = $_;
1456 @fields = split(/:/, $line);
1457
1458 # get a phrase number for this line
1459 $key = shift @fields;
1460
1461 # restore the text of the phrase
1462 $text = shift @fields;
1463 $text =~ s/s(\d+)/$symbol[$1]/g;
1464 if ($text =~ / /) {
1465 $word = "";
1466 } elsif ($text ne 'untranslated') {
1467 $word = $text;
1468 }
1469
1470 # output the phrase data
1471 print DATA "<Document>";
1472 print DATA "$key:$text:", join(":", @fields), ":\n";
1473
1474 # output the word index search data
1475 print WORD "<Document>$word\n";
1476
1477 # output the phrases to a text file
1478 if ($savephrases) {
1479 if ((scalar @fields) == 7) {
1480 $isThesaurus = 1;
1481 } else {
1482 $isThesaurus = 0;
1483 }
1484 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1485 }
1486 }
1487 close IN;
1488 close WORD;
1489 close DATA;
1490 close SAVE if ($savephrases);
1491
1492}
1493
1494
1495
1496# sort routines used to renumber phrases
1497
1498sub numerically { $a <=> $b }
1499
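# Compare document entries of the form "docnum,frequency" (a missing
# ",frequency" part counts as 1), sorting by descending frequency.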
1500sub by_doc_frequency {
1501 my $fa = 1;
1502 if ($a =~ /,/) {
1503 $fa = $a;
1504 $fa =~ s/\d+,//;
1505 }
1506 my $fb = 1;
1507 if ($b =~ /,/) {
1508 $fb = $b;
1509 $fb =~ s/\d+,//;
1510 }
1511
1512 return ($fb <=> $fa);
1513}
1514
15151;