source: gsdl/trunk/perllib/classify/Phind.pm @ 18455

Last change on this file since 18455 was 18455, checked in by davidb, 15 years ago

Addition of 'edit_mode' parameter to classify(). This can be either 'add' 'delete' or 'reindex' (should think about renaming the last one to something more appropriate, e.g. update).

  • Property svn:keywords set to Author Date Id Revision
File size: 40.3 KB
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
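#
# For illustration only (the option values below are invented, not taken from
# a real collection), a collect.cfg entry for this classifier might look
# something like:
#
#   classify Phind -text section:Title,section:text -title Title -buttonname Phrase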
31
32# 12/05/02 Added usage datastructure - John Thompson
33
34package Phind;
35
36use BaseClassifier;
37use util;
38use ghtml;
39use unicode;
40
41use strict;
42no strict 'refs'; # allow filehandles to be variables and vice versa
43
44my @removedirs = ();
45
46my %wanted_index_files = ('td'=>1,
47 't'=>1,
48 'ti'=>1,
49 'tl'=>1,
50 'tsd'=>1,
51 'idb'=>1,
52 'ib1'=>1,
53 'ib2'=>1,
54 'ib3'=>1,
55 'i'=>1,
56 'il'=>1,
57 'w'=>1,
58 'wa'=>1);
59
60sub BEGIN {
61 @Phind::ISA = ('BaseClassifier');
62}
63
64sub END {
65
66 # Tidy up stray files - we do this here as there's some weird problem
67 # preventing us from doing it in the get_classify_info() function (on
68 # windows at least) where the close() appears to fail on txthandle and
69 # dochandle, thus preventing us from deleting those files
70
71 foreach my $dir (@removedirs) {
72 if (-d $dir && opendir (DIR, $dir)) {
73 my @files = readdir DIR;
74 closedir DIR;
75
76 foreach my $file (@files) {
77 next if $file =~ /^\.\.?$/;
78 my ($suffix) = $file =~ /\.([^\.]+)$/;
79 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
80 # delete it!
81 &util::rm (&util::filename_cat ($dir, $file));
82 }
83 }
84 }
85 }
86}
87
88my $arguments =
89 [ { 'name' => "text",
90 'desc' => "{Phind.text}",
91 'type' => "string",
92 'deft' => "section:Title,section:text",
93 'reqd' => "no" },
94 { 'name' => "title",
95 'desc' => "{Phind.title}",
96 'type' => "metadata",
97 'deft' => "Title",
98 'reqd' => "no" },
99 { 'name' => "buttonname",
100 'desc' => "{BasClas.buttonname}",
101 'type' => "string",
102 'deft' => "Phrase",
103 'reqd' => "no" },
104 { 'name' => "language",
105 'desc' => "{Phind.language}",
106 'type' => "string",
107 'deft' => "en",
108 'reqd' => "no" },
109 { 'name' => "savephrases",
110 'desc' => "{Phind.savephrases}",
111 'type' => "string",
112 'deft' => "",
113 'reqd' => "no" },
114 { 'name' => "suffixmode",
115 'desc' => "{Phind.suffixmode}",
116 'type' => "int",
117 'deft' => "1",
118 'range' => "0,1",
119 'reqd' => "no" },
120 { 'name' => "min_occurs",
121 'desc' => "{Phind.min_occurs}",
122 'type' => "int",
123 'deft' => "2",
124 'range' => "1,",
125 'reqd' => "no" },
126 { 'name' => "thesaurus",
127 'desc' => "{Phind.thesaurus}",
128 'type' => "string",
129 'deft' => "",
130 'reqd' => "no" },
131 { 'name' => "untidy",
132 'desc' => "{Phind.untidy}",
133 'type' => "flag",
134 'reqd' => "no" } ];
135
136my $options = { 'name' => "Phind",
137 'desc' => "{Phind.desc}",
138 'abstract' => "no",
139 'inherits' => "yes",
140 'args' => $arguments };
141
142
143# Phrase delimiter symbols - these should be abstracted out someplace
144
145my $colstart = "COLLECTIONSTART";
146my $colend = "COLLECTIONEND";
147my $doclimit = "DOCUMENTLIMIT";
148my $senlimit = "SENTENCELIMIT";
149my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
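# How these are used below: classify() writes DOCUMENTLIMIT into the clauses
# file between documents, while extract_vocabulary() writes the symbol numbers
# for COLLECTIONSTART and COLLECTIONEND at the start and end of clauses.numbers
# and uses the SENTENCELIMIT symbol as the clause (phrase) delimiter there.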
150
151
152# Create a new Phind browser based on collect.cfg
153
154sub new {
155 my ($class) = shift (@_);
156 my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
157 push(@$classifierslist, $class);
158
159 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
160 push(@{$hashArgOptLists->{"OptList"}},$options);
161
162 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
163
164 if ($self->{'info_only'}) {
165 # don't worry about any options etc
166 return bless $self, $class;
167 }
168
169 # Ensure the Phind generate scripts are in place
170 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
171 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
172 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
173 if (!(-e $file1)) {
174 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
175 exit(1);
176 }
177
178# Transfer the value from Auto Parsing to the variable name that was used in previous versions of Greenstone.
179 $self->{"indexes"} = $self->{"text"};
180
181 # Further setup
182 $self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
183 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
184 if (! defined $self->{'builddir'}) {
185 $self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
186 }
187 $self->{'total'} = 0;
188
189 # Clean out the unused keys
190 delete $self->{"text"};
191
192 return bless $self, $class;
193}
194
195
196# Initialise the Phind classifier
197
198sub init {
199 my $self = shift (@_);
200
201 # ensure we have a build directory
202 my $builddir = $self->{'builddir'};
203 die unless (-e "$builddir");
204
205 # create Phind directory
206 my $phnumber = 1;
207 my $phinddir = &util::filename_cat($builddir, "phind1");
208 while (-e "$phinddir") {
209 $phnumber++;
210 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
211 }
212 &util::mk_dir("$phinddir");
213 $self->{'phinddir'} = $phinddir;
214 $self->{'phindnumber'} = $phnumber;
215
216 push(@removedirs, $phinddir) unless $self->{'untidy'};
217
218 # open filehandles for documents and text
219 my $clausefile = &util::filename_cat("$phinddir", "clauses");
220 &util::rm($clausefile) if (-e $clausefile);
221
222 my $txthandle = 'TEXT' . $phnumber;
223 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
224 $self->{'txthandle'} = $txthandle;
225
226 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
227 &util::rm($docfile) if (-e $docfile);
228
229 my $dochandle = 'DOC' . $phnumber;
230 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
231 $self->{'dochandle'} = $dochandle;
232
233}
234
235
236# Classify each document.
237#
238# Each document is passed here in turn. The classifier extracts the
239# text of each and stores it in the clauses file. Document details are
240# stored in the docs.txt file.
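#
# Each docs.txt entry is a single tab-separated line of the form
#   <Document>\tOID\tTitle
# e.g. (illustrative values only) "<Document>\tHASHa1b2c3\tIntroduction".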
241
242sub classify {
243 my $self = shift (@_);
244 my ($doc_obj,$edit_mode) = @_;
245
246 my $verbosity = $self->{'verbosity'};
247 my $top_section = $doc_obj->get_top_section();
248
249 my $titlefield = $self->{'title'};
250
251 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
252 if (!defined($title)) {
253 $title = "";
254 print STDERR "Phind: document has no title\n";
255 }
256 print "process: $title\n" if ($verbosity > 2);
257
258 # Only consider the file if it is in the correct language
259 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
260 my $phrlanguage = $self->{'language'};
261 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
262
263 if ($edit_mode eq "delete") {
264 # This classifier works quite differently to the others
265 # Probably doesn't support incremental building anyway
266 return;
267 }
268
269 # record this file
270 $self->{'total'} ++;
271 # what is $file ???
272 # print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
273
274 # Store document details
275 my $OID = $doc_obj->get_OID();
276 $OID = "NULL" unless defined $OID;
277 my $dochandle = $self->{'dochandle'};
278 print $dochandle "<Document>\t$OID\t$title\n";
279
280# Store the text occurring in this object
281
282 # output the document delimiter
283 my $txthandle = $self->{'txthandle'};
284 print $txthandle "$doclimit\n";
285
286 # iterate over the required indexes and store their text
287 my $indexes = $self->{'indexes'};
288 my $text = "";
289 my ($part, $level, $field, $section, $data, $dataref);
290
291 foreach $part (split(/,/, $indexes)) {
292
293# Each field has a level and a data element (e.g. document:Title)
294 ($level, $field) = split(/:/, $part);
295 die unless ($level && $field);
296
297 # Extract the text from every section
298 # (In phind, document:text and section:text are equivalent)
299 if ($field eq "text") {
300 $data = "";
301 $section = $doc_obj->get_top_section();
302 while (defined($section)) {
303 $data .= $doc_obj->get_text($section) . "\n";
304 $section = $doc_obj->get_next_section($section);
305 }
306 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
307 }
308
309 # Extract a metadata field from a document
310 # (If there is more than one element of the given type, get them all.)
311 elsif ($level eq "document") {
312 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
313 foreach $data (@$dataref) {
314 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
315 }
316 }
317
318 # Extract metadata from every section in a document
319 elsif ($level eq "section") {
320 $data = "";
321 $section = $doc_obj->get_top_section();
322 while (defined($section)) {
323 $dataref = $doc_obj->get_metadata($section, $field);
324 $data .= join("\n", @$dataref) . "\n";
325 $section = $doc_obj->get_next_section($section);
326 }
327 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
328 }
329
330 # Some sort of specification which I don't understand
331 else {
332 die "Unknown level ($level) in Phind index ($part)\n";
333 }
334
335 }
336
337 # output the text
338 $text =~ tr/\n//s;
339 print $txthandle "$text";
340}
341
342
343# Construct the classifier from the information already gathered
344#
345# When get_classify_info is called, the clauses and docs.txt files have
346# already been constructed in the Phind directory. This function will
347# translate them into compressed, indexed MGPP files that can be read by
348# the phindcgi script. It will also register our classifier so that it
349# shows up in the navigation bar.
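#
# Concretely, three MGPP databases are built in the phind directory below:
# "pdata" (one record per phrase), "pword" (a word-level index used to look
# up phrases by keyword), and "docs" (the document OID/title records).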
350
351sub get_classify_info {
352 my $self = shift (@_);
353 my ($gli) = @_;
354
355 close $self->{'dochandle'};
356 close $self->{'txthandle'};
357 my $verbosity = $self->{'verbosity'};
358 my $out = $self->{'outhandle'};
359 my $phinddir = $self->{'phinddir'};
360
361 my $osextra = "";
362 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
363 $osextra = " -d /";
364 }
365
366 print STDERR "</Stage>\n" if $gli;
367
368 if ($verbosity) {
369 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
370 print $out "*** in ", $self->{'phinddir'}, "\n";
371 }
372
373 print STDERR "<Stage name='Phind'>\n" if $gli;
374
375 # Construct phind indexes
376 my $suffixmode = $self->{'suffixmode'};
377 my $min_occurs = $self->{'min_occurs'};
378 my ($command, $status);
379
380 # Generate the vocabulary, symbol statistics, and numbers file
381 # from the clauses file
382 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
383 print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
384 &extract_vocabulary($self);
385
386 # Use the suffix program to generate the phind/phrases file
387 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
388 print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
389 &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
390
391 # check that we generated some files. It's not necessarily an error if
392 # we didn't (execute() would have quit on error), but we can't go on.
393 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
394 if (! -r $phrasesfile) {
395 print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
396 print $out "\nNo phrases found for Phind classifier!\n";
397 return;
398 }
399
400 # Create the phrase file and put phrase numbers in phind/phrases
401 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
402 print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
403 &renumber_phrases($self);
404
405 print $out "\nCreating phrase databases\n";
406 print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
407 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
408 my $mg_stem = &util::filename_cat($phinddir, "pdata");
409
410 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
411 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
412 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
413
414 # create the mg index of words
415 print $out "\nCreating word-level search indexes\n";
416 print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
417 $mg_input = &util::filename_cat($phinddir, "pword.txt");
418 $mg_stem = &util::filename_cat($phinddir, "pword");
419
420 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
421 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
422 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
423 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
424 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
425 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
426
427 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
428 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
429 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
430
431 # create the mg document information database
432 print $out "\nCreating document information databases\n";
433 print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
434 $mg_input = &util::filename_cat($phinddir, "docs.txt");
435 $mg_stem = &util::filename_cat($phinddir, "docs");
436
437 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
438 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
439 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
440
441 # Return the information about the classifier that we'll later want to
442 # use to create macros when the Phind classifier document is displayed.
443 my %classifyinfo = ('thistype'=>'Invisible',
444 'childtype'=>'Phind',
445 'Title'=>$self->{'buttonname'},
446 'parameters'=>"phindnumber=$self->{'phindnumber'}",
447 'contains'=>[]);
448
449 my $collection = $self->{'collection'};
450 my $url = "library?a=p&p=phind&c=$collection";
451 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
452
453 return \%classifyinfo;
454}
455
456
457
458sub convert_gml_to_tokens {
459
460 my ($language_exp, $text) = @_;
461
462 # escape any magic words... - jrm21
463 foreach my $delim (@delimiters) {
464 my $replacement=lc($delim);
465 my $num= $text=~ s/$delim/$replacement/g;
466 if (!$num) {$num=0;}
467 }
468
469 if ($language_exp =~ /en/) {
470 return &convert_gml_to_tokens_EN($text);
471 }
472
473 if ($language_exp =~ /zh/) {
474 return &convert_gml_to_tokens_ZH($text);
475 }
476
477 $_ = $text;
478
479 # 1. remove GML tags
480
481 # Remove everything that is in a tag
482 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
483 s/\s*<br>\s*/ LINEBREAK /isgo;
484 s/<[^>]*>/ /sgo;
485
486 # Now we have the text, but it may contain HTML
487 # elements coded as &gt; etc. Remove these tags.
488 s/&amp;/&/sgo;
489 s/&lt;/</sgo;
490 s/&gt;/>/sgo;
491 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
492 s/\s*<br>\s*/ LINEBREAK /isgo;
493 s/<[^>]*>/ /sgo;
494
495# replace <p> and <br> placeholders with the clause break symbol (\n)
496 s/\s+/ /gso;
497 s/PARAGRAPHBREAK/\n/sgo;
498 s/LINEBREAK/\n/sgo;
499
500
501
502
503 # 2. Split the remaining text into space-delimited tokens
504
505 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
506 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
507
508 # Split text at word boundaries
509 s/\b/ /go;
510
511 # 3. Convert the remaining text to "clause format"
512
513 # Insert newline if the end of a sentence is detected
514# (delimiter is: "[\.\?\!]\s")
515 # s/\s*[\.\?\!]\s+/\n/go;
516
517 # remove unnecessary punctuation and replace with clause break symbol (\n)
518 s/[^\w ]/\n/go;
519
520 # remove extraneous whitespace
521 s/ +/ /sgo;
522 s/^\s+//mgo;
523 s/\s*$/\n/mgo;
524
525 # remove lines that contain one word or less
526 s/^\S*$//mgo;
527 s/^\s*$//mgo;
528 tr/\n//s;
529
530 return $_;
531}
532
533# A Chinese version of convert_gml_to_tokens
534sub convert_gml_to_tokens_ZH {
535
536 $_ = shift @_;
537
538 # Replace all whitespace with a simple space
539 s/\s+/ /gs;
540 # Remove everything that is in a tag
541 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
542 s/\s*<br>\s*/ LINEBREAK /isg;
543 s/<[^>]*>/ /sg;
544
545 # Now we have the text, but it may contain HTML
546 # elements coded as &gt; etc. Remove these tags.
547 s/&lt;/</sg;
548 s/&gt;/>/sg;
549
550 s/\s+/ /sg;
551 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
552 s/\s*<br>\s*/ LINEBREAK /isg;
553 s/<[^>]*>/ /sg;
554
555 # remove &amp; and other miscellaneous markup tags
556 s/&amp;/&/sg;
557 s/&lt;/</sg;
558 s/&gt;/>/sg;
559 s/&amp;/&/sg;
560
561# replace <p> and <br> placeholders with carriage returns
562 s/PARAGRAPHBREAK/\n/sg;
563 s/LINEBREAK/\n/sg;
564
565
566# print STDERR "text:$_\n";
567 return $_;
568}
569
570# A version of convert_gml_to_tokens that is fine-tuned to the English language.
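# For example (an illustrative sketch rather than captured output), an input
# fragment such as
#   <p>The World-Wide Web, a global system.</p>
# comes out as roughly two clauses:
#   The World Wide Web
#   a global system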
571
572sub convert_gml_to_tokens_EN {
573 $_ = shift @_;
574
575 # FIRST, remove GML tags
576
577 # Replace all whitespace with a simple space
578 s/\s+/ /gs;
579
580 # Remove everything that is in a tag
581 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
582 s/\s*<br>\s*/ LINEBREAK /isg;
583 s/<[^>]*>/ /sg;
584
585 # Now we have the text, but it may contain HTML
586 # elements coded as &gt; etc. Remove these tags.
587 s/&lt;/</sg;
588 s/&gt;/>/sg;
589
590 s/\s+/ /sg;
591 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
592 s/\s*<br>\s*/ LINEBREAK /isg;
593 s/<[^>]*>/ /sg;
594
595 # remove &amp; and other miscellaneous markup tags
596 s/&amp;/&/sg;
597 s/&lt;/</sg;
598 s/&gt;/>/sg;
599 s/&amp;/&/sg;
600
601# replace <p> and <br> placeholders with carriage returns
602 s/PARAGRAPHBREAK/\n/sg;
603 s/LINEBREAK/\n/sg;
604
605
606 # Exceptional punctuation
607 #
608 # We make special cases of some punctuation
609
610 # remove any apostrophe that indicates omitted letters
611 s/(\w+)\'(\w*\s)/ $1$2 /g;
612
614# remove periods that appear in a person's initials
614 s/\s([A-Z])\./ $1 /g;
615
617# replace hyphens in hyphenated words and names with a space
617 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
618
619# Convert the remaining text to "clause format".
620# This means removing all excess punctuation and garbage text,
621# normalising valid punctuation to full stops and commas,
622# then putting one clause on each line.
623
624 # Insert newline when the end of a sentence is detected
626# (delimiter is: "[\.\?\!]\s")
626 s/\s*[\.\?\!]\s+/\n/g;
627
628 # split numbers after four digits
629 s/(\d\d\d\d)/$1 /g;
630
631 # split words after 32 characters
632
633 # squash repeated punctuation
634 tr/A-Za-z0-9 //cs;
635
636 # save email addresses
637 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
638
639 # normalise clause breaks (mostly punctuation symbols) to commas
640 s/[^A-Za-z0-9 \n]+/ , /g;
641
642 # Remove repeated commas, and replace with newline
643 s/\s*,[, ]+/\n/g;
644
645 # remove extra whitespace
646 s/ +/ /sg;
647 s/^\s+//mg;
648 s/\s*$/\n/mg;
649
650 # remove lines that contain one word or less
651 s/^\w*$//mg;
652 s/^\s*$//mg;
653 tr/\n//s;
654
655 return $_;
656
657}
658
659
660
661# Execute a system command
662
663sub execute {
664 my ($command, $verbosity, $outhandle) = @_;
665 print $outhandle "Executing: $command\n" if ($verbosity > 2);
666 $! = 0;
667 my $status = system($command);
668 if ($status != 0) {
669 print STDERR "Phind - Error executing '$command': $!\n";
670 exit($status); # this causes the build to fail...
671 }
672}
673
674
675# Generate the vocabulary, symbol statistics, and numbers file from the
676# clauses file. This is legacy code, so it is a bit messy and probably won't
677# run under Windows.
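#
# In outline, it reads the clauses file and writes, into the phind build
# directory: clauses.vocab (one "best form" word per line, indexed by symbol
# number), clauses.stats (the symbol-range statistics), clauses.numbers (the
# clauses re-expressed as symbol numbers) and, when a thesaurus is configured,
# <thesaurus>.numbers.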
678
679sub extract_vocabulary {
680 my ($self) = @_;
681
682 my $verbosity = $self->{'verbosity'};
683 my $out = $self->{'outhandle'};
684
685 my $collectiondir = $self->{'collectiondir'};
686 my $phinddir = $self->{'phinddir'};
687
688 my $language_exp = $self->{'language'};
689
690 my ($w, $l, $line, $word);
691
692 my ($first_delimiter, $last_delimiter,
693 $first_stopword, $last_stopword,
694 $first_extractword, $last_extractword,
695 $first_contentword, $last_contentword,
696 $phrasedelimiter);
697
698 my $thesaurus = $self->{'thesaurus'};
699 my ($thesaurus_links, $thesaurus_terms,
700 %thesaurus, $first_thesaurusword, $last_thesaurusword);
701
702 my %symbol;
703 my (%freq);
704
705 print $out "Calculating vocabulary\n" if ($verbosity > 1);
706
707 # Read and store the stopwords
708 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
709 my $stopword_files = ();
710 my ($language, $language_dir, $file, $file_name);
711 my %stopwords;
712
713 # Examine each directory in the stopword directory
714 opendir(STOPDIR, $stopdir);
715 foreach $language (readdir STOPDIR) {
716
717 # Ignore entries that do not match the classifier's language
718 next unless ($language =~ /$language_exp/);
719 $language_dir = &util::filename_cat($stopdir, $language);
720 next unless (-d "$language_dir");
721
722 opendir(LANGDIR, $language_dir);
723 foreach $file (readdir LANGDIR) {
724
725 # Ignore entries that are not stopword files
726 next unless ($file =~ /sw$/);
727 $file_name = &util::filename_cat($language_dir, $file);
728 next unless (-f "$file_name");
729
730 # Read the stopwords
731 open(STOPFILE, "<$file_name");
732 while (<STOPFILE>) {
733 s/^\s+//;
734 s/\s.*//;
735 $word = $_;
736 $l = lc($word);
737 $stopwords{$l} = $word;
738 }
739 close STOPFILE;
740
741 }
742 closedir LANGDIR;
743 }
744 closedir STOPDIR;
745
746 # Read thesaurus information
747 if ($thesaurus) {
748
749 # link file exists
750 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
751 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
752
753 # ensure term file exists in the correct language
754 if ($language_exp =~ /^([a-z][a-z])/) {
755 $language = $1;
756 } else {
757 $language = 'en';
758 }
759 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
760 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
761
762
763 # Read the thesaurus terms
764 open(TH, "<$thesaurus_terms");
765 while(<TH>) {
766 s/^\d+ //;
767 s/\(.*\)//;
768 foreach $w (split(/\s+/, $_)) {
769 $thesaurus{lc($w)} = $w;
770 }
771 }
772 close TH;
773 }
774
775# Read words in the text and count occurrences
776 open(TXT, "<$phinddir/clauses");
777
778 my @words;
779 while(<TXT>) {
780 $line = $_;
781 next unless ($line =~ /./);
782
783 @words = split(/\s+/, $line);
784 foreach $w (@words) {
785 $l = lc($w);
786 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
787 $freq{$w}++;
788 }
789 $freq{$senlimit}++;
790 }
791
792 close TXT;
793
794 # Calculate the "best" form of each word
795 my (%bestform, %totalfreq, %bestfreq);
796
797 foreach $w (sort (keys %freq)) {
798 $l = lc($w);
799
800 # totalfreq is the number of times a term appears in any form
801 $totalfreq{$l} += $freq{$w};
802
803 if (defined $stopwords{$l}) {
804 $bestform{$l} = $stopwords{$l};
805
806 } elsif (defined $thesaurus{$l}) {
807 $bestform{$l} = $thesaurus{$l};
808
809 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
810 $bestfreq{$l} = $freq{$w};
811 $bestform{$l} = $w;
812 }
813 }
814 undef %freq;
815 undef %bestfreq;
816
817
818 # Assign symbol numbers to tokens
819 my $nextsymbol = 1;
820 my (@vocab);
821
822 # Delimiters
823 $first_delimiter = 1;
824
825 foreach $word (@delimiters) {
826
827# $word = lc($word); # jrm21
828 $word = uc($word);
829 $bestform{$word} = $word;
830 $vocab[$nextsymbol] = $word;
831 $symbol{$word} = $nextsymbol;
832 $nextsymbol++;
833 }
834 $last_delimiter = $nextsymbol - 1;
835 # Stopwords
836 $first_stopword = $nextsymbol;
837
838 foreach my $word (sort keys %stopwords) {
839 # don't include stopword unless it occurs in the text
840 $word = lc($word);
841 next unless ($totalfreq{$word});
842 next if ($symbol{$word});
843
844 $vocab[$nextsymbol] = $word;
845 $symbol{$word} = $nextsymbol;
846 $nextsymbol++;
847 }
848 $last_stopword = $nextsymbol - 1;
849 $first_contentword = $nextsymbol;
850
851 # Thesaurus terms
852 if ($thesaurus) {
853 $first_thesaurusword = $nextsymbol;
854
855 foreach my $word (sort keys %thesaurus) {
856
857 $word = lc($word);
858 next if ($symbol{$word});
859 $bestform{$word} = $thesaurus{$word};
860
861 $vocab[$nextsymbol] = $word;
862 $symbol{$word} = $nextsymbol;
863 $nextsymbol++;
864
865 }
866 $last_thesaurusword = $nextsymbol - 1;
867 }
868
869 # Other content words
870 $first_extractword = $nextsymbol;
871
872 foreach my $word (sort (keys %bestform)) {
873
874 next if ($symbol{$word});
875
876 $vocab[$nextsymbol] = $word;
877 $symbol{$word} = $nextsymbol;
878 $nextsymbol++;
879 }
880 $last_extractword = $nextsymbol - 1;
881 $last_contentword = $nextsymbol - 1;
882
883# Output the words
884 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
885 open(VOC, ">$phinddir/clauses.vocab");
886
887 for (my $i = 1; $i < $nextsymbol; $i++) {
888 $w = $vocab[$i];
889
890 print VOC "$bestform{$w}\n";
891 $totalfreq{$w} = 0 unless ($totalfreq{$w});
892 }
893 close VOC;
894
895
896 # Create statistics file
897 # Output statistics about the vocabulary
898 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
899 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
900
901 open(STAT, ">$phinddir/clauses.stats")
902 || die "Cannot open $phinddir/clauses.stats: $!";
903
904 print STAT "first_delimiter $first_delimiter\n";
905 print STAT "last_delimiter $last_delimiter\n";
906 print STAT "first_stopword $first_stopword\n";
907 print STAT "last_stopword $last_stopword\n";
908 if ($thesaurus) {
909 print STAT "first_thesaurusword $first_thesaurusword\n";
910 print STAT "last_thesaurusword $last_thesaurusword\n";
911 }
912 print STAT "first_extractword $first_extractword\n";
913 print STAT "last_extractword $last_extractword\n";
914 print STAT "first_contentword $first_contentword\n";
915 print STAT "last_contentword $last_contentword\n";
916 print STAT "first_symbol $first_delimiter\n";
917 print STAT "last_symbol $last_contentword\n";
918 print STAT "first_word $first_stopword\n";
919 print STAT "last_word $last_contentword\n";
920 close STAT;
921
922 undef @vocab;
923
924
925 # Create numbers file
926 # Save text as symbol numbers
927 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
928
929 open(TXT, "<$phinddir/clauses");
930 open(NUM, ">$phinddir/clauses.numbers");
931
932## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
933## print NUM "$symbol{lc($colstart)}\n"; # jrm21
934 $phrasedelimiter = $symbol{$senlimit};
935 print NUM "$symbol{$colstart}\n";
936
937 # set up the special symbols that delimit documents and sentences
938 while(<TXT>) {
939
940 # split sentence into a list of tokens
941 $line = $_;
942 next unless ($line =~ /./);
943 @words = split(/\s+/, $line);
944
945 # output one token at a time
946 foreach $word (@words) {
947# don't lower-case special delimiters - jrm21
948 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
949 $word = lc($word);
950 }
951 print NUM "$symbol{$word}\n";
952 }
953
954 # output phrase delimiter
955 print NUM "$phrasedelimiter\n";
956 }
957
958 close TXT;
959# print NUM "$symbol{lc($colend)}\n";# jrm21
960 print NUM "$symbol{$colend}\n";
961 close NUM;
962
963 # Save thesaurus data in one convenient file
964 if ($thesaurus) {
965
966 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
967
968
969 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
970 if ($verbosity > 1);
971
972 # Read the thesaurus terms
973 my ($num, $text, %thes_symbols);
974
975 open(TH, "<$thesaurus_terms");
976 while(<TH>) {
977 chomp;
978 @words = split(/\s+/, $_);
979 $num = shift @words;
980 $text = "";
981
982 # translate words into symbol numbers
983 foreach $word (@words) {
984 $word = lc($word);
985 if ($symbol{$word}) {
986 $text .= "s$symbol{$word} ";
987 } elsif ($verbosity) {
988 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
989 }
990 }
991 $text =~ s/ $//;
992 $thes_symbols{$num} = $text;
993 }
994 close TH;
995
996 # Read the thesaurus links and write the corresponding data
997 open(TH, "<$thesaurus_links");
998 open(THOUT, ">$thesaurusfile");
999
1000 while(<TH>) {
1001 chomp;
1002 ($num, $text) = split(/:/, $_);
1003
1004 if (defined($thes_symbols{$num})) {
1005 print THOUT "$num:$thes_symbols{$num}:$text\n";
1006 } else {
1007 print THOUT "$num:untranslated:$text\n";
1008 }
1009 }
1010 close TH;
1011 close THOUT;
1012 }
1013
1014
1015
1016
1017}
1018
1019
1020# renumber_phrases
1021#
1022# Prepare the phrases file to be input to mgpp. The biggest problem is
1023# reconciling the phrase identifiers used by the suffix program (which
1024# we'll call suffix-id numbers) with the numbers used in the thesaurus
1025# (thesaurus-id) to create a common set of phind-id numbers (phind-id).
1026# Phind-id numbers must be sorted by frequency of occurrence.
1027#
1028# Start creating a set of phind-id numbers from the sorted suffix-id
1029# numbers and (if required) the thesaurus-id numbers. Then add any other
1030# phrases occurring in the thesaurus.
1031#
1032# The last thing we have to do is restore the vocabulary information to the
1033# phrase file so that the phrases are stored as words, not as symbol
1034# numbers.
1035
1036# The original phrases file looks something like this:
1037# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1038# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1039# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1040# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
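#
# Reading each line as colon-separated fields, the layout (as consumed by
# renumber_suffix_data below) is:
#   suffix-id : symbols : total frequency : expansion count :
#   comma-separated expansion suffix-ids : document count :
#   semicolon-separated documents, each "dNUM" or "dNUM,frequency"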
1041
1042
1043sub renumber_phrases {
1044 my ($self) = @_;
1045
1046 renumber_suffix_data($self);
1047 renumber_thesaurus_data($self);
1048 restore_vocabulary_data($self);
1049
1050}
1051
1052
1053
1054# renumber_suffix_data
1055#
1056# Translate phrases file to phrases.2 using phind keys instead
1057# of suffix keys and sorting the expansion data.
1058
1059sub renumber_suffix_data {
1060 my ($self) = @_;
1061
1062 my $verbosity = $self->{'verbosity'};
1063 my $out = $self->{'outhandle'};
1064 print $out "Translate phrases: suffix-ids become phind-ids\n"
1065 if ($verbosity);
1066
1067 my $phinddir = $self->{'phinddir'};
1068 my $infile = &util::filename_cat($phinddir, 'phrases');
1069 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1070
1071 # Read the phrase file. Calculate initial set of phind-id
1072 # numbers and store (suffixid -> frequency) relation.
1073
1074 my %suffixtophind;
1075 my @totalfrequency;
1076 my (@fields, $suffixid);
1077 my $nextphind = 1;
1078
1079 open(IN, "<$infile");
1080 while(<IN>) {
1081
1082 chomp;
1083 @fields = split(/:/, $_);
1084
1085 # get next suffixid and phindid
1086 $suffixid = shift @fields;
1087 $suffixtophind{$suffixid} = $nextphind;
1088
1089 # store total frequency
1090 shift @fields;
1091 $totalfrequency[$nextphind] = shift @fields;
1092
1093 $nextphind++;
1094 }
1095 close IN;
1096
1097
1098 # Translate phrases file to phrases.2. Use phind keys (not suffix
1099# keys), sort expansion and document occurrence data in order of
1100# descending frequency.
1101 open(IN, "<$infile");
1102 open(OUT, ">$outfile");
1103
1104 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1105 my (@documents, @newexp, $k, $n);
1106 my $linenumber = 0;
1107
1108 while(<IN>) {
1109
1110 # read the line
1111 chomp;
1112 @fields = split(/:/, $_);
1113
1114 # get a phrase number for this line
1115 $suffixid = shift @fields;
1116 die unless (defined($suffixtophind{$suffixid}));
1117 $phindid = $suffixtophind{$suffixid};
1118
1119 # get the symbols in the phrase
1120 $text = shift @fields;
1121
1122 # output status information
1123 $linenumber++;
1124 if ($verbosity > 2) {
1125 if ($linenumber % 1000 == 0) {
1126 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1127 }
1128 # what are $num and $key??
1129 #print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1130 }
1131
1132 # get the phrase frequency
1133 $tf = shift @fields;
1134
1135 # get the number of expansions
1136 $countexp = shift @fields;
1137
1138 # get the expansions, convert them into phind-id numbers, and sort them
1139 $expansions = shift @fields;
1140 @newexp = ();
1141 foreach $k (split(/,/, $expansions)) {
1142 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1143 $n = $suffixtophind{$k};
1144 push @newexp, $n;
1145 }
1146 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1147
1148 # get the number of documents
1149 $countdocs = shift @fields;
1150
1151 # get the documents and sort them
1152 $documents = shift @fields;
1153 $documents =~ s/d//g;
1154 my @documents = split(/;/, $documents);
1155 @documents = sort by_doc_frequency @documents;
1156
1157 # output the phrase data
1158 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1159 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1160
1161 }
1162
1163 close IN;
1164 close OUT;
1165}
1166
1167
1168# renumber_thesaurus_data
1169#
1170# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1171
1172sub renumber_thesaurus_data {
1173 my ($self) = @_;
1174
1175 my $out = $self->{'outhandle'};
1176 my $verbosity = $self->{'verbosity'};
1177 my $thesaurus = $self->{'thesaurus'};
1178
1179 my $phinddir = $self->{'phinddir'};
1180 my $infile = &util::filename_cat($phinddir, "phrases.2");
1181 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1182
1183
1184 # If no thesaurus is defined, simply move the phrases file.
1185 if (!$thesaurus) {
1186 print $out "Translate phrases.2: no thesaurus data\n"
1187 if ($verbosity);
1188 &util::mv($infile, $outfile);
1189 return;
1190 }
1191
1192 print $out "Translate phrases.2: add thesaurus data\n"
1193 if ($verbosity);
1194
1195 # 1.
1196 # Read thesaurus file and store (symbols->thesaurusid) mapping
1197 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1198 my %symbolstothesid;
1199 my (@fields, $thesid, $symbols);
1200
1201 open(TH, "<$thesaurusfile");
1202
1203 while (<TH>) {
1204
1205 chomp;
1206 @fields = split(/:/, $_);
1207
1208 # get id and text
1209 $thesid = shift @fields;
1210 $symbols = shift @fields;
1211 $symbolstothesid{$symbols} = $thesid;
1212 }
1213 close TH;
1214
1215 # 2.
1216 # Read phrases file to find thesaurus entries that already
1217 # have a phindid. Store their phind-ids for later translation,
1218 # and store their frequency for later sorting.
1219 my %thesaurustophindid;
1220 my %phindidtofrequency;
1221 my ($phindid, $freq);
1222
1223 open(IN, "<$infile");
1224
1225 while(<IN>) {
1226
1227 chomp;
1228 @fields = split(/:/, $_);
1229
1230 # phindid and symbols for this line
1231 $phindid = shift @fields;
1232 $symbols = shift @fields;
1233 $freq = shift @fields;
1234
1235 # do we have a thesaurus id corresponding to this phrase?
1236 if (defined($symbolstothesid{$symbols})) {
1237 $thesid = $symbolstothesid{$symbols};
1238 $thesaurustophindid{$thesid} = $phindid;
1239 $phindidtofrequency{$phindid} = $freq;
1240 }
1241 }
1242 close IN;
1243
1244 undef %symbolstothesid;
1245
1246 # 3.
1247 # Create phind-id numbers for remaining thesaurus entries,
1248 # and note that their frequency is 0 for later sorting.
1249 my $nextphindid = $phindid + 1;
1250
1251 open(TH, "<$thesaurusfile");
1252 while(<TH>) {
1253
1254 chomp;
1255 @fields = split(/:/, $_);
1256
1257 # read thesaurus-id and ensure it has a corresponding phind-id
1258 $thesid = shift @fields;
1259 if (!defined($thesaurustophindid{$thesid})) {
1260 $thesaurustophindid{$thesid} = $nextphindid;
1261 $phindidtofrequency{$nextphindid} = 0;
1262 $nextphindid++;
1263 }
1264 }
1265 close TH;
1266
1267 # 4.
1268 # Translate thesaurus file, replacing thesaurus-id numbers with
1269 # phind-id numbers.
1270 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1271 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1272 my (@links, $linkid, %linkidtotype, $newrelation);
1273
1274 open(TH, "<$thesaurusfile");
1275 open(TO, ">$newthesaurusfile");
1276 while(<TH>) {
1277
1278 chomp;
1279 @fields = split(/:/, $_);
1280
1281 # phindid and symbols for this line
1282 ($thesid, $symbols, $relations) = @fields;
1283
1284 die unless ($thesid && $symbols);
1285 die unless $thesaurustophindid{$thesid};
1286 $phindid = $thesaurustophindid{$thesid};
1287
1288# convert each part of the relation string to use phind-id numbers.
1289# At the same time, we want to sort the list by frequency.
1290 undef %linkidtotype;
1291
1292 foreach $linktext (split(/;/, $relations)) {
1293 @linkdata = split(/,/, $linktext);
1294
1295 # remember the linktype (e.g. BT, NT)
1296 $linktype = shift @linkdata;
1297
1298 # store the type of each link
1299 foreach $thesid (@linkdata) {
1300 die unless (defined($thesaurustophindid{$thesid}));
1301 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1302 }
1303 }
1304
1305 # sort the list of links, first by frequency, then by type.
1306 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1307 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1308 $linkcounter = (scalar @links);
1309
1310 # create a string describing the link information
1311 $linktype = $linkidtotype{$links[0]};
1312 $newrelation = $linktype;
1313 foreach $linkid (@links) {
1314 if ($linkidtotype{$linkid} ne $linktype) {
1315 $linktype = $linkidtotype{$linkid};
1316 $newrelation .= ";" . $linktype;
1317 }
1318 $newrelation .= "," . $linkid;
1319 }
1320 $newrelation .= ";";
1321
1322
1323 # output the new line
1324 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1325 }
1326 close TH;
1327 close TO;
1328
1329 undef %thesaurustophindid;
1330 undef %linkidtotype;
1331 undef %phindidtofrequency;
1332
1333 # 5.
1334 # Read thesaurus data (in phind-id format) into memory
1335 my %thesaurusdata;
1336
1337 open(TH, "<$newthesaurusfile");
1338 while(<TH>) {
1339 chomp;
1340 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1341 die unless ($phindid && $symbols);
1342 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1343 }
1344 close TH;
1345
1346 # 6.
1347 # Add thesaurus data to phrases file
1348 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1349 my (@documents, @newexp, $k, $n);
1350 my $linenumber = 0;
1351
1352 open(IN, "<$infile");
1353 open(OUT, ">$outfile");
1354
1355 # Update existing phrases
1356 while(<IN>) {
1357
1358 chomp;
1359 @fields = split(/:/, $_);
1360
1361 # get data for this line
1362 $phindid = shift @fields;
1363
1364 # output the phrase data, with thesaurus information
1365 print OUT "$phindid:", join(":", @fields);
1366
1367 # add thesaurus data
1368 if (defined($thesaurusdata{$phindid})) {
1369 @fields = split(/:/, $thesaurusdata{$phindid});
1370 shift @fields;
1371 $linkcounter = shift @fields;
1372 $relations = shift @fields;
1373
1374 print OUT ":$linkcounter:$relations";
1375 $thesaurusdata{$phindid} = "";
1376 }
1377 print OUT "\n";
1378 }
1379 close IN;
1380
1381 # Add phrases that aren't already in the file
1382 foreach $phindid (sort numerically keys %thesaurusdata) {
1383 next unless ($thesaurusdata{$phindid});
1384
1385 @fields = split(/:/, $thesaurusdata{$phindid});
1386 $symbols = shift @fields;
1387 $linkcounter = shift @fields;
1388 $relations = shift @fields;
1389
1390 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1391 }
1392 close OUT;
1393
1394}
1395
1396# restore_vocabulary_data
1397#
1398# Read phrases.3 and restore vocabulary information. Then write
1399# this data to the MGPP input files (pword.txt and pdata.txt) and
1400# (if requested) to the saved phrases file.
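#
# Both MGPP input files hold one "<Document>" record per phrase: pdata.txt
# lines look like
#   <Document>phind-id:phrase text:tf:expansion count:document count:expansions:documents:...
# (with thesaurus link fields appended when present), while pword.txt lines
# are just "<Document>word", where word is (roughly) empty unless the phrase
# is a single word.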
1401
1402sub restore_vocabulary_data {
1403 my ($self) = @_;
1404
1405 my $out = $self->{'outhandle'};
1406 my $verbosity = $self->{'verbosity'};
1407 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1408
1409 my $phinddir = $self->{'phinddir'};
1410 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1411 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1412 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1413 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1414
1415 my $savephrases = $self->{'savephrases'};
1416
1417 # 1.
1418 # Read the vocabulary file
1419 open(V, "<$vocabfile")
1420 || die "Cannot open $vocabfile: $!";
1421 my @symbol;
1422 my $i = 1;
1423 while(<V>) {
1424 chomp;
1425 $symbol[$i++] = $_;
1426 }
1427 close V;
1428
1429 # 2.
1430 # Translate phrases.3 to MGPP input files
1431 my ($key, $text, $word, $isThesaurus, $line);
1432 my @fields;
1433 my $linenumber = 0;
1434
1435 open(IN, "<$infile");
1436 open(DATA, ">$datafile");
1437 open(WORD, ">$wordfile");
1438
1439 # Save the phrases in a separate text file
1440 if ($savephrases) {
1441 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1442 open(SAVE, ">$savephrases");
1443 }
1444
1445 while(<IN>) {
1446
1447 # read the line
1448 chomp;
1449 $line = $_;
1450 @fields = split(/:/, $line);
1451
1452 # get a phrase number for this line
1453 $key = shift @fields;
1454
1455 # restore the text of the phrase
1456 $text = shift @fields;
1457 $text =~ s/s(\d+)/$symbol[$1]/g;
1458 if ($text =~ / /) {
1459 $word = "";
1460 } elsif ($text ne 'untranslated') {
1461 $word = $text;
1462 }
1463
1464 # output the phrase data
1465 print DATA "<Document>";
1466 print DATA "$key:$text:", join(":", @fields), ":\n";
1467
1468 # output the word index search data
1469 print WORD "<Document>$word\n";
1470
1471 # output the phrases to a text file
1472 if ($savephrases) {
1473 if ((scalar @fields) == 7) {
1474 $isThesaurus = 1;
1475 } else {
1476 $isThesaurus = 0;
1477 }
1478 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1479 }
1480 }
1481 close IN;
1482 close WORD;
1483 close DATA;
1484 close SAVE if ($savephrases);
1485
1486}
1487
1488
1489
1490# sort routines used to renumber phrases
1491
1492sub numerically { $a <=> $b }
1493
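# Each document entry is either "NUM" (implicit frequency 1) or "NUM,freq";
# by_doc_frequency orders entries by that frequency, highest first. For
# example (illustrative), ("12,5", "7", "3,2") sorts to ("12,5", "3,2", "7").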
1494sub by_doc_frequency {
1495 my $fa = 1;
1496 if ($a =~ /,/) {
1497 $fa = $a;
1498 $fa =~ s/\d+,//;
1499 }
1500 my $fb = 1;
1501 if ($b =~ /,/) {
1502 $fb = $b;
1503 $fb =~ s/\d+,//;
1504 }
1505
1506 return ($fb <=> $fa);
1507}
1508
15091;