Context Navigation

source: trunk/gsdl/perllib/classify/Phind.pm@ 7023

Last change on this file since 7023 was 6989, checked in by kjdon, 20 years ago
changed the language option to type string so you can have more than one lang when configuring it in GLI
Property svn:keywords set to `Author Date Id Revision`
File size: 40.3 KB

Line
1	###########################################################################
2	#
3	# Phind.pm -- the Phind classifier
4	#
5	# Copyright (C) 2000 Gordon W. Paynter
6	# Copyright (C) 2000 New Zealand Digital Library Project
7	#
8	#
9	# A component of the Greenstone digital library software
10	# from the New Zealand Digital Library Project at the
11	# University of Waikato, New Zealand.
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# The Phind clasifier plugin.
30	# Type "classinfo.pl Phind" at the command line for a summary.
31
32	# 12/05/02 Added usage datastructure - John Thompson
33
34	package Phind;
35
36	use BasClas;
37	use util;
38	use ghtml;
39	use unicode;
40
41	my @removedirs = ();
42
43	my %wanted_index_files = ('td'=>1,
44	't'=>1,
45	'ti'=>1,
46	'tl'=>1,
47	'tsd'=>1,
48	'idb'=>1,
49	'ib1'=>1,
50	'ib2'=>1,
51	'ib3'=>1,
52	'i'=>1,
53	'il'=>1,
54	'w'=>1,
55	'wa'=>1);
56
57	sub BEGIN {
58	@ISA = ('BasClas');
59	}
60
61	sub END {
62
63	# Tidy up stray files - we do this here as there's some weird problem
64	# preventing us from doing it in the get_classify_info() function (on
65	# windows at least) where the close() appears to fail on txthandle and
66	# dochandle, thus preventing us from deleting those files
67
68	foreach my $dir (@removedirs) {
69	if (-d $dir && opendir (DIR, $dir)) {
70	my @files = readdir DIR;
71	closedir DIR;
72
73	foreach $file (@files) {
74	next if $file =~ /^\.\.?$/;
75	my ($suffix) = $file =~ /\.([^\.]+)$/;
76	if (!defined $suffix \|\| !defined $wanted_index_files{$suffix}) {
77	# delete it!
78	&util::rm (&util::filename_cat ($dir, $file));
79	}
80	}
81	}
82	}
83	}
84
85	my $arguments =
86	[ { 'name' => "text",
87	'desc' => "{Phind.text}",
88	'type' => "string",
89	'deft' => "section:Title,section:text",
90	'reqd' => "no" },
91	{ 'name' => "title",
92	'desc' => "{Phind.title}",
93	'type' => "metadata",
94	'deft' => "Title",
95	'reqd' => "no" },
96	{ 'name' => "buttonname",
97	'desc' => "{BasClas.buttonname}",
98	'type' => "string",
99	'deft' => "Phrase",
100	'reqd' => "no" },
101	{ 'name' => "language",
102	'desc' => "{Phind.language}",
103	'type' => "string",
104	'deft' => "en",
105	'reqd' => "no" },
106	{ 'name' => "savephrases",
107	'desc' => "{Phind.savephrases}",
108	'type' => "string",
109	'deft' => "",
110	'reqd' => "no" },
111	{ 'name' => "suffixmode",
112	'desc' => "{Phind.suffixmode}",
113	'type' => "int",
114	'deft' => "1",
115	'reqd' => "no" },
116	{ 'name' => "thesaurus",
117	'desc' => "{Phind.thesaurus}",
118	'type' => "string",
119	'deft' => "",
120	'reqd' => "no" },
121	{ 'name' => "untidy",
122	'desc' => "{Phind.untidy}",
123	'type' => "flag",
124	'reqd' => "no" } ];
125
126	my $options = { 'name' => "Phind",
127	'desc' => "{Phind.desc}",
128	'abstract' => "no",
129	'inherits' => "yes",
130	'args' => $arguments };
131
132
133	# Phrase delimiter symbols - these should be abstracted out someplace
134
135	my $colstart = "COLLECTIONSTART";
136	my $colend = "COLLECTIONEND";
137	my $doclimit = "DOCUMENTLIMIT";
138	my $senlimit = "SENTENCELIMIT";
139	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
140
141
142	# Create a new Phind browser based on collect.cfg
143
144	sub new {
145	my $class = shift (@_);
146	my $self = new BasClas($class, @_);
147
148	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
149	my $option_list = $self->{'option_list'};
150	push( @{$option_list}, $options );
151
152	if ($self->{'info_only'}) {
153	# created from classinfo.pl - don't need to parse the arguments
154	return bless $self, $class;
155	}
156
157	my $out = $self->{'outhandle'};
158
159	# Ensure the Phind generate scripts are in place
160	my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
161	$file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
162	my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
163
164	if (!(-e $file1)) {
165	print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
166	exit(1);
167	}
168
169	# Parse classifier arguments
170	my $builddir = "";
171	if (!parsargv::parse(\@_,
172	q^text/.*/section:Title,section:text^, \$self->{'indexes'},
173	q^title/.*/Title^, \$self->{'titlefield'},
174	q^buttonname/.*/Phrase^, \$self->{'buttonname'},
175	q^language/.*/en^, \$language,
176	q^savephrases/.*/^, \$self->{'savephrases'},
177	q^suffixmode/\d/1^, \$self->{'suffixmode'},
178	q^thesaurus/.*/^, \$self->{'thesaurus'},
179	q^untidy^, \$self->{'untidy'},
180	"allow_extra_options")) {
181
182	print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
183	$self->print_txt_usage(""); # Use default resource bundle
184	die "\n";
185	}
186
187	# classifier information
188	$self->{'collection'} = $ENV{'GSDLCOLLECTION'};
189
190	# limit languages
191	$self->{'language_exp'} = $language;
192
193	# collection directories
194	$self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
195
196	if (! defined $self->{'builddir'}) {
197	$self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
198	}
199
200	$self->{'total'} = 0;
201
202	return bless $self, $class;
203	}
204
205
206	# Initialise the Phind classifier
207
208	sub init {
209	my $self = shift (@_);
210
211	# ensure we have a build directory
212	my $builddir = $self->{'builddir'};
213	die unless (-e "$builddir");
214
215	# create Phind directory
216	my $phnumber = 1;
217	my $phinddir = &util::filename_cat($builddir, "phind1");
218	while (-e "$phinddir") {
219	$phnumber++;
220	$phinddir = &util::filename_cat($builddir, "phind$phnumber");
221	}
222	&util::mk_dir("$phinddir");
223	$self->{'phinddir'} = $phinddir;
224	$self->{'phindnumber'} = $phnumber;
225
226	push(@removedirs, $phinddir) unless $self->{'untidy'};
227
228	# open filehandles for documents and text
229	my $clausefile = &util::filename_cat("$phinddir", "clauses");
230	&util::rm($clausefile) if (-e $clausefile);
231
232	my $txthandle = 'TEXT' . $phnumber;
233	open($txthandle, ">$clausefile") \|\| die "Cannot open $clausefile: $!";
234	$self->{'txthandle'} = $txthandle;
235
236	my $docfile = &util::filename_cat("$phinddir", "docs.txt");
237	&util::rm($docfile) if (-e $docfile);
238
239	my $dochandle = 'DOC' . $phnumber;
240	open($dochandle, ">$docfile") \|\| die "Cannot open $docfile: $!";
241	$self->{'dochandle'} = $dochandle;
242
243	}
244
245
246	# Classify each document.
247	#
248	# Each document is passed here in turn. The classifier extracts the
249	# text of each and stores it in the clauses file. Document details are
250	# stored in the docs.txt file.
251
252	sub classify {
253	my $self = shift (@_);
254	my $doc_obj = shift @_;
255
256	my $verbosity = $self->{'verbosity'};
257	my $top_section = $doc_obj->get_top_section();
258
259	my $titlefield = $self->{'titlefield'};
260
261	my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
262	if (!defined($title)) {
263	$title = "";
264	print STDERR "Phind: document has no title\n";
265	}
266	print "process: $title\n" if ($verbosity > 2);
267
268	# Only consider the file if it is in the correct language
269	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
270	my $phrlanguage = $self->{'language_exp'};
271	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
272
273	# record this file
274	$self->{'total'} ++;
275	print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
276
277
278	# Store document details
279	my $OID = $doc_obj->get_OID();
280	$OID = "NULL" unless defined $OID;
281	my $dochandle = $self->{'dochandle'};
282	print $dochandle "<Document>\t$OID\t$title\n";
283
284	# Store the text occuring in this object
285
286	# output the document delimiter
287	my $txthandle = $self->{'txthandle'};
288	print $txthandle "$doclimit\n";
289
290	# iterarate over the required indexes and store their text
291	my $indexes = $self->{'indexes'};
292	my $text = "";
293	my ($part, $level, $field, $section, $data, $dataref);
294
295	foreach $part (split(/,/, $indexes)) {
296
297	# Each field has a level and a data element ((e.g. document:Title)
298	($level, $field) = split(/:/, $part);
299	die unless ($level && $field);
300
301	# Extract the text from every section
302	# (In phind, document:text and section:text are equivalent)
303	if ($field eq "text") {
304	$data = "";
305	$section = $doc_obj->get_top_section();
306	while (defined($section)) {
307	$data .= $doc_obj->get_text($section) . "\n";
308	$section = $doc_obj->get_next_section($section);
309	}
310	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
311	}
312
313	# Extract a metadata field from a document
314	# (If there is more than one element of the given type, get them all.)
315	elsif ($level eq "document") {
316	$dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
317	foreach $data (@$dataref) {
318	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
319	}
320	}
321
322	# Extract metadata from every section in a document
323	elsif ($level eq "section") {
324	$data = "";
325	$section = $doc_obj->get_top_section();
326	while (defined($section)) {
327	$dataref = $doc_obj->get_metadata($section, $field);
328	$data .= join("\n", @$dataref) . "\n";
329	$section = $doc_obj->get_next_section($section);
330	}
331	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
332	}
333
334	# Some sort of specification which I don't understand
335	else {
336	die "Unknown level ($level) in Phind index ($part)\n";
337	}
338
339	}
340
341	# output the text
342	$text =~ tr/\n//s;
343	print $txthandle "$text";
344	}
345
346
347	# Construct the classifier from the information already gathered
348	#
349	# When get_classify_info is called, the clauses and docs.txt files have
350	# already been constructed in the Phind directory. This function will
351	# translate them into compressed, indexed MGPP files that can be read by
352	# the phindcgi script. It will also register our classifier so that it
353	# shows up in the navigation bar.
354
355	sub get_classify_info {
356	my $self = shift (@_);
357	my ($gli) = @_;
358
359	close $self->{'dochandle'};
360	close $self->{'txthandle'};
361	my $verbosity = $self->{'verbosity'};
362	my $out = $self->{'outhandle'};
363	my $phinddir = $self->{'phinddir'};
364
365	my $osextra = "";
366	if ($ENV{'GSDLOS'} !~ /^windows$/i) {
367	$osextra = " -d /";
368	}
369
370	print STDERR "</Stage>\n" if $gli;
371
372	if ($verbosity) {
373	print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
374	print $out "*** in ", $self->{'phinddir'}, "\n";
375	}
376
377	print STDERR "<Stage name='Phind'>\n" if $gli;
378
379	# Construct phind indexes
380	my $suffixmode = $self->{'suffixmode'};
381	my ($command, $status);
382
383	# Generate the vocabulary, symbol statistics, and numbers file
384	# from the clauses file
385	print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
386	print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
387	&extract_vocabulary($self);
388
389	# Use the suffix program to generate the phind/phrases file
390	print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
391	print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
392	&execute("suffix \"$phinddir\" $suffixmode $verbosity", $verbosity, $out);
393
394	# check that we generated some files. It's not necessarily an error if
395	# we didn't (execute() would have quit on error), but we can't go on.
396	my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
397	if (! -r $phrasesfile) {
398	print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
399	print $out "\nNo phrases found for Phind classifier!\n";
400	return;
401	}
402
403	# Create the phrase file and put phrase numbers in phind/phrases
404	print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
405	print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
406	&renumber_phrases($self);
407
408	print $out "\nCreating phrase databases\n";
409	print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
410	my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
411	my $mg_stem = &util::filename_cat($phinddir, "pdata");
412
413	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
414	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
415	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
416
417	# create the mg index of words
418	print $out "\nCreating word-level search indexes\n";
419	print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
420	$mg_input = &util::filename_cat($phinddir, "pword.txt");
421	$mg_stem = &util::filename_cat($phinddir, "pword");
422
423	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
424	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
425	&execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
426	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
427	&execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
428	&execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
429
430	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
431	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
432	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
433
434	# create the mg document information database
435	print $out "\nCreating document information databases\n";
436	print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
437	$mg_input = &util::filename_cat($phinddir, "docs.txt");
438	$mg_stem = &util::filename_cat($phinddir, "docs");
439
440	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
441	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
442	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
443
444	# Return the information about the classifier that we'll later want to
445	# use to create macros when the Phind classifier document is displayed.
446	my %classifyinfo = ('thistype'=>'Invisible',
447	'childtype'=>'Phind',
448	'Title'=>$self->{'buttonname'},
449	'parameters'=>"phindnumber=$self->{'phindnumber'}",
450	'contains'=>[]);
451
452	my $collection = $self->{'collection'};
453	my $url = "library?a=p&p=phind&c=$collection";
454	push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
455
456	return \%classifyinfo;
457	}
458
459
460
461	sub convert_gml_to_tokens {
462
463	my ($language_exp, $text) = @_;
464
465	# escape any magic words... - jrm21
466	foreach my $delim (@delimiters) {
467	my $replacement=lc($delim);
468	my $num= $text=~ s/$delim/$replacement/g;
469	if (!$num) {$num=0;}
470	}
471
472	if ($language_exp =~ /en/) {
473	return &convert_gml_to_tokens_EN($text);
474	}
475
476	if ($language_exp =~ /zh/) {
477	return &convert_gml_to_tokens_ZH($text);
478	}
479
480	$_ = $text;
481
482	# 1. remove GML tags
483
484	# Remove everything that is in a tag
485	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
486	s/\s<br>\s/ LINEBREAK /isgo;
487	s/<[^>]*>/ /sgo;
488
489	# Now we have the text, but it may contain HTML
490	# elements coded as > etc. Remove these tags.
491	s/&/&/sgo;
492	s/</</sgo;
493	s/>/>/sgo;
494	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
495	s/\s<br>\s/ LINEBREAK /isgo;
496	s/<[^>]*>/ /sgo;
497
498	# replace<p> and <br> placeholders with clause break symbol (\n)
499	s/\s+/ /gso;
500	s/PARAGRAPHBREAK/\n/sgo;
501	s/LINEBREAK/\n/sgo;
502
503
504
505
506	# 2. Split the remaining text into space-delimited tokens
507
508	# Convert any HTML special characters (like ") to their UTF8 equivalent
509	s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
510
511	# Split text at word boundaries
512	s/\b/ /go;
513
514	# 3. Convert the remaining text to "clause format"
515
516	# Insert newline if the end of a sentence is detected
517	# (delimter is: "[\.\?\!]\s")
518	# s/\s*[\.\?\!]\s+/\n/go;
519
520	# remove unnecessary punctuation and replace with clause break symbol (\n)
521	s/[^\w ]/\n/go;
522
523	# remove extraneous whitespace
524	s/ +/ /sgo;
525	s/^\s+//mgo;
526	s/\s*$/\n/mgo;
527
528	# remove lines that contain one word or less
529	s/^\S*$//mgo;
530	s/^\s*$//mgo;
531	tr/\n//s;
532
533	return $_;
534	}
535
536	# a chinese version
537	sub convert_gml_to_tokens_ZH {
538
539	$_ = shift @_;
540
541	# Replace all whitespace with a simple space
542	s/\s+/ /gs;
543	# Remove everything that is in a tag
544	s/\s<p>\s/ PARAGRAPHBREAK /isg;
545	s/\s<br>\s/ LINEBREAK /isg;
546	s/<[^>]*>/ /sg;
547
548	# Now we have the text, but it may contain HTML
549	# elements coded as > etc. Remove these tags.
550	s/</</sg;
551	s/>/>/sg;
552
553	s/\s+/ /sg;
554	s/\s<p>\s/ PARAGRAPHBREAK /isg;
555	s/\s<br>\s/ LINEBREAK /isg;
556	s/<[^>]*>/ /sg;
557
558	# remove & and other miscellaneous markup tags
559	s/&/&/sg;
560	s/</</sg;
561	s/>/>/sg;
562	s/&/&/sg;
563
564	# replace<p> and <br> placeholders with carriage returns
565	s/PARAGRAPHBREAK/\n/sg;
566	s/LINEBREAK/\n/sg;
567
568
569	# print STDERR "text:$_\n";
570	return $_;
571	}
572
573	# A version of convert_gml_to_tokens that is fine-tuned to the English language.
574
575	sub convert_gml_to_tokens_EN {
576	$_ = shift @_;
577
578	# FIRST, remove GML tags
579
580	# Replace all whitespace with a simple space
581	s/\s+/ /gs;
582
583	# Remove everything that is in a tag
584	s/\s<p>\s/ PARAGRAPHBREAK /isg;
585	s/\s<br>\s/ LINEBREAK /isg;
586	s/<[^>]*>/ /sg;
587
588	# Now we have the text, but it may contain HTML
589	# elements coded as > etc. Remove these tags.
590	s/</</sg;
591	s/>/>/sg;
592
593	s/\s+/ /sg;
594	s/\s<p>\s/ PARAGRAPHBREAK /isg;
595	s/\s<br>\s/ LINEBREAK /isg;
596	s/<[^>]*>/ /sg;
597
598	# remove & and other miscellaneous markup tags
599	s/&/&/sg;
600	s/</</sg;
601	s/>/>/sg;
602	s/&/&/sg;
603
604	# replace<p> and <br> placeholders with carriage returns
605	s/PARAGRAPHBREAK/\n/sg;
606	s/LINEBREAK/\n/sg;
607
608
609	# Exceptional punctuation
610	#
611	# We make special cases of some punctuation
612
613	# remove any apostrophe that indicates omitted letters
614	s/(\w+)\'(\w*\s)/ $1$2 /g;
615
616	# remove period that appears in a person's initals
617	s/\s([A-Z])\./ $1 /g;
618
619	# replace hyphens in hypheanted words and names with a space
620	s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
621
622	# Convert the remaining text to "clause format",
623	# This means removing all excess punctuation and garbage text,
624	# normalising valid punctuation to fullstops and commas,
625	# then putting one cluse on each line.
626
627	# Insert newline when the end of a sentence is detected
628	# (delimter is: "[\.\?\!]\s")
629	s/\s*[\.\?\!]\s+/\n/g;
630
631	# split numbers after four digits
632	s/(\d\d\d\d)/$1 /g;
633
634	# split words after 32 characters
635
636	# squash repeated punctuation
637	tr/A-Za-z0-9 //cs;
638
639	# save email addresses
640	# s/\w+@\w+\.[\w\.]+/EMAIL/g;
641
642	# normalise clause breaks (mostly punctuation symbols) to commas
643	s/[^A-Za-z0-9 \n]+/ , /g;
644
645	# Remove repeated commas, and replace with newline
646	s/\s*,[, ]+/\n/g;
647
648	# remove extra whitespace
649	s/ +/ /sg;
650	s/^\s+//mg;
651	s/\s*$/\n/mg;
652
653	# remove lines that contain one word or less
654	s/^\w*$//mg;
655	s/^\s*$//mg;
656	tr/\n//s;
657
658	return $_;
659
660	}
661
662
663
664	# Execute a system command
665
666	sub execute {
667	my ($command, $verbosity, $outhandle) = @_;
668	print $outhandle "Executing: $command\n" if ($verbosity > 2);
669	$! = 0;
670	my $status = system($command);
671	if ($status != 0) {
672	print STDERR "Phind - Error executing '$command': $!\n";
673	exit($status); # this causes the build to fail...
674	}
675	}
676
677
678	# Generate the vocabulary, symbol statistics, and numbers file from the
679	# clauses file. This is legacy code, so is a bit messy and probably wont
680	# run under windows.
681
682	sub extract_vocabulary {
683	my ($self) = @_;
684
685	my $verbosity = $self->{'verbosity'};
686	my $out = $self->{'outhandle'};
687
688	my $collectiondir = $self->{'collectiondir'};
689	my $phinddir = $self->{'phinddir'};
690
691	my $language_exp = $self->{'language_exp'};
692
693	my ($w, $l, $line, $word);
694
695	my ($first_delimiter, $last_delimiter,
696	$first_stopword, $last_stopword,
697	$first_extractword, $last_extractword,
698	$first_contentword, $last_contentword,
699	$phrasedelimiter);
700
701	my $thesaurus = $self->{'thesaurus'};
702	my ($thesaurus_links, $thesaurus_terms,
703	%thesaurus, $first_thesaurusword, $last_thesaurusword);
704
705	my %symbol;
706	my (%freq);
707
708	print $out "Calculating vocabulary\n" if ($verbosity > 1);
709
710	# Read and store the stopwords
711	my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
712	my $stopword_files = ();
713	my ($language, $language_dir, $file, $file_name);
714	my %stopwords;
715
716	# Examine each directory in the stopword directory
717	opendir(STOPDIR, $stopdir);
718	foreach $language (readdir STOPDIR) {
719
720	# Ignore entries that do not match the classifier's language
721	next unless ($language =~ /$language_exp/);
722	$language_dir = &util::filename_cat($stopdir, $language);
723	next unless (-d "$language_dir");
724
725	opendir(LANGDIR, $language_dir);
726	foreach $file (readdir LANGDIR) {
727
728	# Ignore entries that are not stopword files
729	next unless ($file =~ /sw$/);
730	$file_name = &util::filename_cat($language_dir, $file);
731	next unless (-f "$file_name");
732
733	# Read the stopwords
734	open(STOPFILE, "<$file_name");
735	while (<STOPFILE>) {
736	s/^\s+//;
737	s/\s.*//;
738	$word = $_;
739	$l = lc($word);
740	$stopwords{$l} = $word;
741	}
742	close STOPFILE;
743
744	}
745	closedir LANGDIR;
746	}
747	closedir STOPDIR;
748
749	# Read thesaurus information
750	if ($thesaurus) {
751
752	# link file exists
753	$thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
754	die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
755
756	# ensure term file exists in the correct language
757	if ($language_exp =~ /^([a-z][a-z])/) {
758	$language = $1;
759	} else {
760	$language = 'en';
761	}
762	$thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
763	die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
764
765
766	# Read the thesaurus terms
767	open(TH, "<$thesaurus_terms");
768	while(<TH>) {
769	s/^\d+ //;
770	s/\(.*\)//;
771	foreach $w (split(/\s+/, $_)) {
772	$thesaurus{lc($w)} = $w;
773	}
774	}
775	close TH;
776	}
777
778	# Read words in the text and count occurences
779	open(TXT, "<$phinddir/clauses");
780
781	my @words;
782	while(<TXT>) {
783	$line = $_;
784	next unless ($line =~ /./);
785
786	@words = split(/\s+/, $line);
787	foreach $w (@words) {
788	$l = lc($w);
789	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
790	$freq{$w}++;
791	}
792	$freq{$senlimit}++;
793	}
794
795	close TXT;
796
797	# Calculate the "best" form of each word
798	my (%bestform, %totalfreq, %bestfreq);
799
800	foreach $w (sort (keys %freq)) {
801	$l = lc($w);
802
803	# totalfreq is the number of times a term appears in any form
804	$totalfreq{$l} += $freq{$w};
805
806	if (defined $stopwords{$l}) {
807	$bestform{$l} = $stopwords{$l};
808
809	} elsif (defined $thesaurus{$l}) {
810	$bestform{$l} = $thesaurus{$l};
811
812	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
813	$bestfreq{$l} = $freq{$w};
814	$bestform{$l} = $w;
815	}
816	}
817	undef %freq;
818	undef %bestfreq;
819
820
821	# Assign symbol numbers to tokens
822	my $nextsymbol = 1;
823	my (@vocab);
824
825	# Delimiters
826	$first_delimiter = 1;
827
828	foreach $word (@delimiters) {
829
830	# $word = lc($word); # jrm21
831	$word = uc($word);
832	$bestform{$word} = $word;
833	$vocab[$nextsymbol] = $word;
834	$symbol{$word} = $nextsymbol;
835	$nextsymbol++;
836	}
837	$last_delimiter = $nextsymbol - 1;
838	# Stopwords
839	$first_stopword = $nextsymbol;
840
841	foreach my $word (sort keys %stopwords) {
842	# don't include stopword unless it occurs in the text
843	$word = lc($word);
844	next unless ($totalfreq{$word});
845	next if ($symbol{$word});
846
847	$vocab[$nextsymbol] = $word;
848	$symbol{$word} = $nextsymbol;
849	$nextsymbol++;
850	}
851	$last_stopword = $nextsymbol - 1;
852	$first_contentword = $nextsymbol;
853
854	# Thesaurus terms
855	if ($thesaurus) {
856	$first_thesaurusword = $nextsymbol;
857
858	foreach my $word (sort keys %thesaurus) {
859
860	$word = lc($word);
861	next if ($symbol{$word});
862	$bestform{$word} = $thesaurus{$word};
863
864	$vocab[$nextsymbol] = $word;
865	$symbol{$word} = $nextsymbol;
866	$nextsymbol++;
867
868	}
869	$last_thesaurusword = $nextsymbol - 1;
870	}
871
872	# Other content words
873	$first_extractword = $nextsymbol;
874
875	foreach my $word (sort (keys %bestform)) {
876
877	next if ($symbol{$word});
878
879	$vocab[$nextsymbol] = $word;
880	$symbol{$word} = $nextsymbol;
881	$nextsymbol++;
882	}
883	$last_extractword = $nextsymbol - 1;
884	$last_contentword = $nextsymbol - 1;
885
886	# Outut the words
887	print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
888	open(VOC, ">$phinddir/clauses.vocab");
889
890	for (my $i = 1; $i < $nextsymbol; $i++) {
891	$w = $vocab[$i];
892
893	print VOC "$bestform{$w}\n";
894	$totalfreq{$w} = 0 unless ($totalfreq{$w});
895	}
896	close VOC;
897
898
899	# Create statistics file
900	# Output statistics about the vocablary
901	print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
902	&util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
903
904	open(STAT, ">$phinddir/clauses.stats")
905	\|\| die "Cannot open $phinddir/clauses.stats: $!";
906
907	print STAT "first_delimiter $first_delimiter\n";
908	print STAT "last_delimiter $last_delimiter\n";
909	print STAT "first_stopword $first_stopword\n";
910	print STAT "last_stopword $last_stopword\n";
911	if ($thesaurus) {
912	print STAT "first_thesaurusword $first_thesaurusword\n";
913	print STAT "last_thesaurusword $last_thesaurusword\n";
914	}
915	print STAT "first_extractword $first_extractword\n";
916	print STAT "last_extractword $last_extractword\n";
917	print STAT "first_contentword $first_contentword\n";
918	print STAT "last_contentword $last_contentword\n";
919	print STAT "first_symbol $first_delimiter\n";
920	print STAT "last_symbol $last_contentword\n";
921	print STAT "first_word $first_stopword\n";
922	print STAT "last_word $last_contentword\n";
923	close STAT;
924
925	undef @vocab;
926
927
928	# Create numbers file
929	# Save text as symbol numbers
930	print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
931
932	open(TXT, "<$phinddir/clauses");
933	open(NUM, ">$phinddir/clauses.numbers");
934
935	## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
936	## print NUM "$symbol{lc($colstart)}\n"; # jrm21
937	$phrasedelimiter = $symbol{$senlimit};
938	print NUM "$symbol{$colstart}\n";
939
940	# set up the special symbols that delimit documents and sentences
941	while(<TXT>) {
942
943	# split sentence into a list of tokens
944	$line = $_;
945	next unless ($line =~ /./);
946	@words = split(/\s+/, $line);
947
948	# output one token at a time
949	foreach $word (@words) {
950	# don't lower-case special delimiters - jrm21
951	if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
952	$word = lc($word);
953	}
954	print NUM "$symbol{$word}\n";
955	}
956
957	# output phrase delimiter
958	print NUM "$phrasedelimiter\n";
959	}
960
961	close TXT;
962	# print NUM "$symbol{lc($colend)}\n";# jrm21
963	print NUM "$symbol{$colend}\n";
964	close NUM;
965
966	# Save thesaurus data in one convienient file
967	if ($thesaurus) {
968
969	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
970
971
972	print $out "Saving thesaurus as numbers in $thesaurusfile\n"
973	if ($verbosity > 1);
974
975	# Read the thesaurus terms
976	my ($num, $text, %thes_symbols);
977
978	open(TH, "<$thesaurus_terms");
979	while(<TH>) {
980	chomp;
981	@words = split(/\s+/, $_);
982	$num = shift @words;
983	$text = "";
984
985	# translate words into symbol numbers
986	foreach $word (@words) {
987	$word = lc($word);
988	if ($symbol{$word}) {
989	$text .= "s$symbol{$word} ";
990	} elsif ($verbosity) {
991	print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
992	}
993	}
994	$text =~ s/ $//;
995	$thes_symbols{$num} = $text;
996	}
997	close TH;
998
999	# Read the thesaurus links and write the corresponding data
1000	open(TH, "<$thesaurus_links");
1001	open(THOUT, ">$thesaurusfile");
1002
1003	while(<TH>) {
1004	chomp;
1005	($num, $text) = split(/:/, $_);
1006
1007	if (defined($thes_symbols{$num})) {
1008	print THOUT "$num:$thes_symbols{$num}:$text\n";
1009	} else {
1010	print THOUT "$num:untranslated:$text\n";
1011	}
1012	}
1013	close TH;
1014	close THOUT;
1015	}
1016
1017
1018
1019
1020	}
1021
1022
1023	# renumber_phrases
1024	#
1025	# Prepare the phrases file to be input to mgpp. The biggest problem is
1026	# reconciling the phrase identifiers used by the suffix program (which
1027	# we'll call suffix-id numbers) with the numbers used in the thesaurus
1028	# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
1029	# Phind-id numbers must be sorted by frequency of occurance.
1030	#
1031	# Start creating a set of phind-id numbers from the sorted suffix-id
1032	# numbers and (if required) the thesaurus-id numbers. Then add any other
1033	# phrases occuring in the thesaurus.
1034	#
1035	# The last thing we have to do is restore the vocabulary information to the
1036	# phrase file so that the phrases are stored as words, not as symbol
1037	# numbers.
1038
1039	# The original phrases file looks something like this:
1040	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1041	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1042	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1043	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1044
1045
1046	sub renumber_phrases {
1047	my ($self) = @_;
1048
1049	renumber_suffix_data($self);
1050	renumber_thesaurus_data($self);
1051	restore_vocabulary_data($self);
1052
1053	}
1054
1055
1056
1057	# renumber_suffix_data
1058	#
1059	# Translate phrases file to phrases.2 using phind keys instead
1060	# of suffix keys and sorting the expansion data.
1061
1062	sub renumber_suffix_data {
1063	my ($self) = @_;
1064
1065	my $verbosity = $self->{'verbosity'};
1066	my $out = $self->{'outhandle'};
1067	print $out "Translate phrases: suffix-ids become phind-id's\n"
1068	if ($verbosity);
1069
1070	my $phinddir = $self->{'phinddir'};
1071	my $infile = &util::filename_cat($phinddir, 'phrases');
1072	my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1073
1074	# Read the phrase file. Calculate initial set of phind-id
1075	# numbers and store (suffixid -> frequency) relation.
1076
1077	my %suffixtophind;
1078	my @phindfrequency;
1079	my (@fields, $suffixid);
1080	my $nextphind = 1;
1081
1082	open(IN, "<$infile");
1083	while(<IN>) {
1084
1085	chomp;
1086	@fields = split(/:/, $_);
1087
1088	# get next suffixid and phindid
1089	$suffixid = shift @fields;
1090	$suffixtophind{$suffixid} = $nextphind;
1091
1092	# store total frequency
1093	shift @fields;
1094	$totalfrequency[$nextphind] = shift @fields;
1095
1096	$nextphind++;
1097	}
1098	close IN;
1099
1100
1101	# Translate phrases file to phrases.2. Use phind keys (not suffix
1102	# keys), sort expansion and document occurance data in order of
1103	# descending frequency..
1104	open(IN, "<$infile");
1105	open(OUT, ">$outfile");
1106
1107	my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1108	my (@documwents, @newexp, $k, $n);
1109	my $linenumber = 0;
1110
1111	while(<IN>) {
1112
1113	# read the line
1114	chomp;
1115	@fields = split(/:/, $_);
1116
1117	# get a phrase number for this line
1118	$suffixid = shift @fields;
1119	die unless (defined($suffixtophind{$suffixid}));
1120	$phindid = $suffixtophind{$suffixid};
1121
1122	# get the symbols in the phrase
1123	$text = shift @fields;
1124
1125	# output status information
1126	$linenumber++;
1127	if ($verbosity > 2) {
1128	if ($linenumber % 1000 == 0) {
1129	print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1130	}
1131	print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1132	}
1133
1134	# get the phrase frequency
1135	$tf = shift @fields;
1136
1137	# get the number of expansions
1138	$countexp = shift @fields;
1139
1140	# get the expansions, convert them into phind-id numbers, and sort them
1141	$expansions = shift @fields;
1142	@newexp = ();
1143	foreach $k (split(/,/, $expansions)) {
1144	die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1145	$n = $suffixtophind{$k};
1146	push @newexp, $n;
1147	}
1148	@newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1149
1150	# get the number of documents
1151	$countdocs = shift @fields;
1152
1153	# get the documents and sort them
1154	$documents = shift @fields;
1155	$documents =~ s/d//g;
1156	@documents = split(/;/, $documents);
1157	@documents = sort by_doc_frequency @documents;
1158
1159	# output the phrase data
1160	print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1161	print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1162
1163	}
1164
1165	close IN;
1166	close OUT;
1167	}
1168
1169
1170	# renumber_thesaurus_data
1171	#
1172	# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1173
1174	sub renumber_thesaurus_data {
1175	my ($self) = @_;
1176
1177	my $out = $self->{'outhandle'};
1178	my $verbosity = $self->{'verbosity'};
1179	my $thesaurus = $self->{'thesaurus'};
1180
1181	my $phinddir = $self->{'phinddir'};
1182	my $infile = &util::filename_cat($phinddir, "phrases.2");
1183	my $outfile = &util::filename_cat($phinddir, "phrases.3");
1184
1185
1186	# If no thesaurus is defined, simply move the phrases file.
1187	if (!$thesaurus) {
1188	print $out "Translate phrases.2: no thesaurus data\n"
1189	if ($verbosity);
1190	&util::mv($infile, $outfile);
1191	return;
1192	}
1193
1194	print $out "Translate phrases.2: add thesaurus data\n"
1195	if ($verbosity);
1196
1197	# 1.
1198	# Read thesaurus file and store (symbols->thesaurusid) mapping
1199	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1200	my %symbolstothesid;
1201	my (@fields, $thesid, $symbols);
1202
1203	open(TH, "<$thesaurusfile");
1204
1205	while (<TH>) {
1206
1207	chomp;
1208	@fields = split(/:/, $_);
1209
1210	# get id and text
1211	$thesid = shift @fields;
1212	$symbols = shift @fields;
1213	$symbolstothesid{$symbols} = $thesid;
1214	}
1215	close TH;
1216
1217	# 2.
1218	# Read phrases file to find thesaurus entries that already
1219	# have a phindid. Store their phind-ids for later translation,
1220	# and store their frequency for later sorting.
1221	my %thesaurustophindid;
1222	my %phindidtofrequency;
1223	my ($phindid, $freq);
1224
1225	open(IN, "<$infile");
1226
1227	while(<IN>) {
1228
1229	chomp;
1230	@fields = split(/:/, $_);
1231
1232	# phindid and symbols for this line
1233	$phindid = shift @fields;
1234	$symbols = shift @fields;
1235	$freq = shift @fields;
1236
1237	# do we have a thesaurus id corresponding to this phrase?
1238	if (defined($symbolstothesid{$symbols})) {
1239	$thesid = $symbolstothesid{$symbols};
1240	$thesaurustophindid{$thesid} = $phindid;
1241	$phindidtofrequency{$phindid} = $freq;
1242	}
1243	}
1244	close IN;
1245
1246	undef %symbolstothesid;
1247
1248	# 3.
1249	# Create phind-id numbers for remaining thesaurus entries,
1250	# and note that their frequency is 0 for later sorting.
1251	my $nextphindid = $phindid + 1;
1252
1253	open(TH, "<$thesaurusfile");
1254	while(<TH>) {
1255
1256	chomp;
1257	@fields = split(/:/, $_);
1258
1259	# read thesaurus-id and ensure it has a corresponding phind-id
1260	$thesid = shift @fields;
1261	if (!defined($thesaurustophindid{$thesid})) {
1262	$thesaurustophindid{$thesid} = $nextphindid;
1263	$phindidtofrequency{$nextphindid} = 0;
1264	$nextphindid++;
1265	}
1266	}
1267	close TH;
1268
1269	# 4.
1270	# Translate thesaurus file, replacing thesaurus-id numbers with
1271	# phind-id numbers.
1272	my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1273	my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1274	my (@links, $linkid, %linkidtotype, $newrelation);
1275
1276	open(TH, "<$thesaurusfile");
1277	open(TO, ">$newthesaurusfile");
1278	while(<TH>) {
1279
1280	chomp;
1281	@fields = split(/:/, $_);
1282
1283	# phindid and symbols for this line
1284	($thesid, $symbols, $relations) = @fields;
1285
1286	die unless ($thesid && $symbols);
1287	die unless $thesaurustophindid{$thesid};
1288	$phindid = $thesaurustophindid{$thesid};
1289
1290	# convert each part of the relation string to use phind-id numbers
1291	# at the same time, we want to sort the list by frequency.
1292	undef %linkidtotype;
1293
1294	foreach $linktext (split(/;/, $relations)) {
1295	@linkdata = split(/,/, $linktext);
1296
1297	# remember the linktype (e.g. BT, NT)
1298	$linktype = shift @linkdata;
1299
1300	# store the type of each link
1301	foreach $thesid (@linkdata) {
1302	die unless (defined($thesaurustophindid{$thesid}));
1303	$linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1304	}
1305	}
1306
1307	# sort the list of links, first by frequency, then by type.
1308	@links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1309	or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1310	$linkcounter = (scalar @links);
1311
1312	# create a string describing the link information
1313	$linktype = $linkidtotype{$links[0]};
1314	$newrelation = $linktype;
1315	foreach $linkid (@links) {
1316	if ($linkidtotype{$linkid} ne $linktype) {
1317	$linktype = $linkidtotype{$linkid};
1318	$newrelation .= ";" . $linktype;
1319	}
1320	$newrelation .= "," . $linkid;
1321	}
1322	$newrelation .= ";";
1323
1324
1325	# output the new line
1326	print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1327	}
1328	close TH;
1329	close TO;
1330
1331	undef %thesaurustophindid;
1332	undef %linkidtotype;
1333	undef %phindidtofrequency;
1334
1335	# 5.
1336	# Read thesaurus data (in phind-id format) into memory
1337	my %thesaurusdata;
1338
1339	open(TH, "<$newthesaurusfile");
1340	while(<TH>) {
1341	chomp;
1342	($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1343	die unless ($phindid && $symbols);
1344	$thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1345	}
1346	close TH;
1347
1348	# 6.
1349	# Add thesaurus data to phrases file
1350	my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1351	my (@documwents, @newexp, $k, $n);
1352	my $linenumber = 0;
1353
1354	open(IN, "<$infile");
1355	open(OUT, ">$outfile");
1356
1357	# Update existing phrases
1358	while(<IN>) {
1359
1360	chomp;
1361	@fields = split(/:/, $_);
1362
1363	# get data for this line
1364	$phindid = shift @fields;
1365
1366	# output the phrase data, with thesaurus information
1367	print OUT "$phindid:", join(":", @fields);
1368
1369	# add thesaurus data
1370	if (defined($thesaurusdata{$phindid})) {
1371	@fields = split(/:/, $thesaurusdata{$phindid});
1372	shift @fields;
1373	$linkcounter = shift @fields;
1374	$relations = shift @fields;
1375
1376	print OUT ":$linkcounter:$relations";
1377	$thesaurusdata{$phindid} = "";
1378	}
1379	print OUT "\n";
1380	}
1381	close IN;
1382
1383	# Add phrases that aren't already in the file
1384	foreach $phindid (sort numerically keys %thesaurusdata) {
1385	next unless ($thesaurusdata{$phindid});
1386
1387	@fields = split(/:/, $thesaurusdata{$phindid});
1388	$symbols = shift @fields;
1389	$linkcounter = shift @fields;
1390	$relations = shift @fields;
1391
1392	print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1393	}
1394	close OUT;
1395
1396	}
1397
1398	# restore_vocabulary_data
1399	#
1400	# Read phrases.3 and restore vocabulary information. Then write
1401	# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1402	# (if requested) to the saved phrases file.
1403
1404	sub restore_vocabulary_data {
1405	my ($self) = @_;
1406
1407	my $out = $self->{'outhandle'};
1408	my $verbosity = $self->{'verbosity'};
1409	print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1410
1411	my $phinddir = $self->{'phinddir'};
1412	my $infile = &util::filename_cat($phinddir, 'phrases.3');
1413	my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1414	my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1415	my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1416
1417	my $savephrases = $self->{'savephrases'};
1418
1419	# 1.
1420	# Read the vocabulary file
1421	open(V, "<$vocabfile")
1422	\|\| die "Cannot open $vocabfile: $!";
1423	my @symbol;
1424	my $i = 1;
1425	while(<V>) {
1426	chomp;
1427	$symbol[$i++] = $_;
1428	}
1429	close V;
1430
1431	# 2.
1432	# Translate phrases.3 to MGPP input files
1433	my ($key, $text, $word, $isThesaurus);
1434	my @fields;
1435	my $linenumber = 0;
1436
1437	open(IN, "<$infile");
1438	open(DATA, ">$datafile");
1439	open(WORD, ">$wordfile");
1440
1441	# Save the phrases in a separate text file
1442	if ($savephrases) {
1443	print $out "Saving phrases in $savephrases\n" if ($verbosity);
1444	open(SAVE, ">$savephrases");
1445	}
1446
1447	while(<IN>) {
1448
1449	# read the line
1450	chomp;
1451	$line = $_;
1452	@fields = split(/:/, $line);
1453
1454	# get a phrase number for this line
1455	$key = shift @fields;
1456
1457	# restore the text of the phrase
1458	$text = shift @fields;
1459	$text =~ s/s(\d+)/$symbol[$1]/g;
1460	if ($text =~ / /) {
1461	$word = "";
1462	} elsif ($text ne 'untranslated') {
1463	$word = $text;
1464	}
1465
1466	# output the phrase data
1467	print DATA "<Document>";
1468	print DATA "$key:$text:", join(":", @fields), ":\n";
1469
1470	# output the word index search data
1471	print WORD "<Document>$word\n";
1472
1473	# output the phrases to a text file
1474	if ($savephrases) {
1475	if ((scalar @fields) == 7) {
1476	$isThesaurus = 1;
1477	} else {
1478	$isThesaurus = 0;
1479	}
1480	print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1481	}
1482	}
1483	close IN;
1484	close WORD;
1485	close DATA;
1486	close SAVE if ($savephrases);
1487
1488	}
1489
1490
1491
1492	# sort routines used to renumber phrases
1493
1494	sub numerically { $a <=> $b }
1495
1496	sub by_doc_frequency {
1497	my $fa = 1;
1498	if ($a =~ /,/) {
1499	$fa = $a;
1500	$fa =~ s/\d+,//;
1501	}
1502	my $fb = 1;
1503	if ($b =~ /,/) {
1504	$fb = $b;
1505	$fb =~ s/\d+,//;
1506	}
1507
1508	return ($fb <=> $fa);
1509	}
1510
1511	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: