Context Navigation

source: trunk/gsdl/perllib/classify/Phind.pm@ 5645

Last change on this file since 5645 was 5645, checked in by mdewsnip, 21 years ago
Moved classifier descriptions into the resource bundle (perllib/strings.rb).
Property svn:keywords set to `Author Date Id Revision`
File size: 41.3 KB

Line
1	###########################################################################
2	#
3	# Phind.pm -- the Phind classifier
4	#
5	# Copyright (C) 2000 Gordon W. Paynter
6	# Copyright (C) 2000 New Zealand Digital Library Project
7	#
8	#
9	# A component of the Greenstone digital library software
10	# from the New Zealand Digital Library Project at the
11	# University of Waikato, New Zealand.
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# The Phind clasifier plugin.
30	# Type "classinfo.pl Phind" at the command line for a summary.
31
32	# 12/05/02 Added usage datastructure - John Thompson
33
34	package Phind;
35
36	use BasClas;
37	use util;
38	use ghtml;
39	use unicode;
40
41	my @removedirs = ();
42
43	my %wanted_index_files = ('td'=>1,
44	't'=>1,
45	'ti'=>1,
46	'tl'=>1,
47	'tsd'=>1,
48	'idb'=>1,
49	'ib1'=>1,
50	'ib2'=>1,
51	'ib3'=>1,
52	'i'=>1,
53	'il'=>1,
54	'w'=>1,
55	'wa'=>1);
56
57	sub BEGIN {
58	@ISA = ('BasClas');
59	}
60
61	sub END {
62
63	# Tidy up stray files - we do this here as there's some weird problem
64	# preventing us from doing it in the get_classify_info() function (on
65	# windows at least) where the close() appears to fail on txthandle and
66	# dochandle, thus preventing us from deleting those files
67
68	foreach my $dir (@removedirs) {
69	if (-d $dir && opendir (DIR, $dir)) {
70	my @files = readdir DIR;
71	closedir DIR;
72
73	foreach $file (@files) {
74	next if $file =~ /^\.\.?$/;
75	my ($suffix) = $file =~ /\.([^\.]+)$/;
76	if (!defined $suffix \|\| !defined $wanted_index_files{$suffix}) {
77	# delete it!
78	&util::rm (&util::filename_cat ($dir, $file));
79	}
80	}
81	}
82	}
83	}
84
85	my $arguments =
86	[ { 'name' => "text",
87	'desc' => "{Phind.text}",
88	'type' => "string",
89	'deft' => "section:Title,section:text",
90	'reqd' => "no" },
91	{ 'name' => "title",
92	'desc' => "{Phind.title}",
93	'type' => "metadata",
94	'deft' => "Title",
95	'reqd' => "no" },
96	{ 'name' => "button",
97	'desc' => "{Phind.button}",
98	'type' => "string",
99	'deft' => "Phrase",
100	'reqd' => "no" },
101	{ 'name' => "language",
102	'desc' => "{Phind.language}",
103	'type' => "language",
104	'deft' => "en",
105	'reqd' => "no" },
106	{ 'name' => "savephrases",
107	'desc' => "{Phind.savephrases}",
108	'type' => "string",
109	'deft' => "",
110	'reqd' => "no" },
111	{ 'name' => "builddir",
112	'desc' => "{BasClas.builddir}",
113	'type' => "string",
114	'deft' => "" },
115	{ 'name' => "suffixmode",
116	'desc' => "{Phind.suffixmode}",
117	'type' => "int",
118	'deft' => "1",
119	'reqd' => "no" },
120	{ 'name' => "thesaurus",
121	'desc' => "{Phind.thesaurus}",
122	'type' => "string",
123	'deft' => "",
124	'reqd' => "no" },
125	{ 'name' => "untidy",
126	'desc' => "{Phind.untidy}",
127	'type' => "flag",
128	'reqd' => "no" } ];
129
130	my $options = { 'name' => "Phind",
131	'desc' => "{Phind.desc}",
132	'inherits' => "Yes",
133	'args' => $arguments };
134
135	# sub print_usage {
136	# print STDERR "
137	# usage: classify Phind [options]
138
139	# options:
140	# -text Fields The text used to build the phrase hierarchy.
141	# (default: 'section:Title,section:text')
142
143	# -title Title The metadata field used to describe each document.
144	# (default: 'Title')
145
146	# -button Name The label for the classifier screen and button in
147	# navigation bar.
148	# (default: 'Phrase')
149
150	# -language Regex Language or languages to use building hierarchy.
151	# Languages are identified by two-letter country codes
152	# like en (English), es (Spanish), and fr (French).
153	# Language is a regular expression, so 'en\|fr' (English or
154	# French) and '..' (match any language) are valid.
155	# (default: 'en'.)
156
157	# -savephrases File If set, the phrase infomation will be stored in
158	# the given file as text. It is probably a good idea
159	# to use an absolute path.
160	# (default: not set)
161
162	# -suffixmode N The smode parameter to the phrase extraction program. A
163	# value of 0 means that stopwords are ignored, and of 1
164	# means that stopwords are used.
165	# (default: 1)
166
167	# -thesaurus Name Name of a thesaurus stored in Phind format in the
168	# collection's etc directory.
169	# (default: not set)
170
171	# -untidy Don't remove working files.
172
173	# "; }
174
175	# Phrase delimiter symbols - these should be abstracted out someplace
176
177	my $colstart = "COLLECTIONSTART";
178	my $colend = "COLLECTIONEND";
179	my $doclimit = "DOCUMENTLIMIT";
180	my $senlimit = "SENTENCELIMIT";
181	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
182
183
184	# Create a new Phind browser based on collect.cfg
185
186	sub new {
187	my $class = shift (@_);
188	my $self = new BasClas($class, @_);
189
190	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
191	my $option_list = $self->{'option_list'};
192	push( @{$option_list}, $options );
193
194	my $out = $self->{'outhandle'};
195
196	# Ensure the Phind generate scripts are in place
197	my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
198	$file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
199	my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
200
201	if (!(-e $file1)) {
202	print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
203	exit(1);
204	}
205
206	# Parse classifier arguments
207	my $builddir = "";
208	if (!parsargv::parse(\@_,
209	q^text/.*/section:Title,section:text^, \$self->{'indexes'},
210	q^title/.*/Title^, \$self->{'titlefield'},
211	q^button/.*/Phrase^, \$self->{'buttonname'},
212	q^language/.*/en^, \$language,
213	q^builddir/.*/^, \$builddir,
214	q^savephrases/.*/^, \$self->{'savephrases'},
215	q^suffixmode/\d/1^, \$self->{'suffixmode'},
216	q^thesaurus/.*/^, \$self->{'thesaurus'},
217	q^untidy^, \$self->{'untidy'},
218	"allow_extra_options")) {
219
220	print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
221	$self->print_txt_usage(""); # Use default resource bundle
222	die "\n";
223	}
224
225	# classifier information
226	$self->{'collection'} = $ENV{'GSDLCOLLECTION'};
227
228	# limit languages
229	$self->{'language_exp'} = $language;
230
231	# collection directories
232	$self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
233	if (!$builddir) {
234	$builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
235	}
236	$self->{'builddir'} = $builddir;
237
238	$self->{'total'} = 0;
239
240	return bless $self, $class;
241	}
242
243
244	# Initialise the Phind classifier
245
246	sub init {
247	my $self = shift (@_);
248
249	# ensure we have a build directory
250	my $builddir = $self->{'builddir'};
251	die unless (-e "$builddir");
252
253	# create Phind directory
254	my $phnumber = 1;
255	my $phinddir = &util::filename_cat($builddir, "phind1");
256	while (-e "$phinddir") {
257	$phnumber++;
258	$phinddir = &util::filename_cat($builddir, "phind$phnumber");
259	}
260	&util::mk_dir("$phinddir");
261	$self->{'phinddir'} = $phinddir;
262	$self->{'phindnumber'} = $phnumber;
263
264	push(@removedirs, $phinddir) unless $self->{'untidy'};
265
266	# open filehandles for documents and text
267	my $clausefile = &util::filename_cat("$phinddir", "clauses");
268	&util::rm($clausefile) if (-e $clausefile);
269
270	my $txthandle = 'TEXT' . $phnumber;
271	open($txthandle, ">$clausefile") \|\| die "Cannot open $clausefile: $!";
272	$self->{'txthandle'} = $txthandle;
273
274	my $docfile = &util::filename_cat("$phinddir", "docs.txt");
275	&util::rm($docfile) if (-e $docfile);
276
277	my $dochandle = 'DOC' . $phnumber;
278	open($dochandle, ">$docfile") \|\| die "Cannot open $docfile: $!";
279	$self->{'dochandle'} = $dochandle;
280
281	}
282
283
284	# Classify each document.
285	#
286	# Each document is passed here in turn. The classifier extracts the
287	# text of each and stores it in the clauses file. Document details are
288	# stored in the docs.txt file.
289
290	sub classify {
291	my $self = shift (@_);
292	my $doc_obj = shift @_;
293
294	my $verbosity = $self->{'verbosity'};
295	my $top_section = $doc_obj->get_top_section();
296
297	my $titlefield = $self->{'titlefield'};
298
299	my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
300	if (!defined($title)) {
301	$title = "";
302	print STDERR "Phind: document has no title\n";
303	}
304	print "process: $title\n" if ($verbosity > 2);
305
306	# Only consider the file if it is in the correct language
307	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
308	my $phrlanguage = $self->{'language_exp'};
309	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
310
311	# record this file
312	$self->{'total'} ++;
313	print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
314
315
316	# Store document details
317	my $OID = $doc_obj->get_OID();
318	$OID = "NULL" unless defined $OID;
319	my $dochandle = $self->{'dochandle'};
320	print $dochandle "<Document>\t$OID\t$title\n";
321
322	# Store the text occuring in this object
323
324	# output the document delimiter
325	my $txthandle = $self->{'txthandle'};
326	print $txthandle "$doclimit\n";
327
328	# iterarate over the required indexes and store their text
329	my $indexes = $self->{'indexes'};
330	my $text = "";
331	my ($part, $level, $field, $section, $data, $dataref);
332
333	foreach $part (split(/,/, $indexes)) {
334
335	# Each field has a level and a data element ((e.g. document:Title)
336	($level, $field) = split(/:/, $part);
337	die unless ($level && $field);
338
339	# Extract the text from every section
340	# (In phind, document:text and section:text are equivalent)
341	if ($field eq "text") {
342	$data = "";
343	$section = $doc_obj->get_top_section();
344	while (defined($section)) {
345	$data .= $doc_obj->get_text($section) . "\n";
346	$section = $doc_obj->get_next_section($section);
347	}
348	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
349	}
350
351	# Extract a metadata field from a document
352	# (If there is more than one element of the given type, get them all.)
353	elsif ($level eq "document") {
354	$dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
355	foreach $data (@$dataref) {
356	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
357	}
358	}
359
360	# Extract metadata from every section in a document
361	elsif ($level eq "section") {
362	$data = "";
363	$section = $doc_obj->get_top_section();
364	while (defined($section)) {
365	$dataref = $doc_obj->get_metadata($section, $field);
366	$data .= join("\n", @$dataref) . "\n";
367	$section = $doc_obj->get_next_section($section);
368	}
369	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
370	}
371
372	# Some sort of specification which I don't understand
373	else {
374	die "Unknown level ($level) in Phind index ($part)\n";
375	}
376
377	}
378
379	# output the text
380	$text =~ tr/\n//s;
381	print $txthandle "$text";
382	}
383
384
385	# Construct the classifier from the information already gathered
386	#
387	# When get_classify_info is called, the clauses and docs.txt files have
388	# already been constructed in the Phind directory. This function will
389	# translate them into compressed, indexed MGPP files that can be read by
390	# the phindcgi script. It will also register our classifier so that it
391	# shows up in the navigation bar.
392
393	sub get_classify_info {
394	my $self = shift (@_);
395
396	close $self->{'dochandle'};
397	close $self->{'txthandle'};
398	my $verbosity = $self->{'verbosity'};
399	my $out = $self->{'outhandle'};
400	my $phinddir = $self->{'phinddir'};
401
402	my $osextra = "";
403	if ($ENV{'GSDLOS'} !~ /^windows$/i) {
404	$osextra = " -d /";
405	}
406
407	if ($verbosity) {
408	print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
409	print $out "*** in ", $self->{'phinddir'}, "\n";
410	}
411
412	# Construct phind indexes
413	my $suffixmode = $self->{'suffixmode'};
414	my ($command, $status);
415
416	# Generate the vocabulary, symbol statistics, and numbers file
417	# from the clauses file
418	print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
419	&extract_vocabulary($self);
420
421	# Use the suffix program to generate the phind/phrases file
422	print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
423	&execute("suffix \"$phinddir\" $suffixmode $verbosity", $verbosity, $out);
424
425	# check that we generated some files. It's not necessarily an error if
426	# we didn't (execute() would have quit on error), but we can't go on.
427	my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
428	if (! -r $phrasesfile) {
429	print $out "\nNo phrases found for Phind classifier!\n";
430	return;
431	}
432
433	# Create the phrase file and put phrase numbers in phind/phrases
434	print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
435	&renumber_phrases($self);
436
437	print $out "\nCreating phrase databases\n";
438	my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
439	my $mg_stem = &util::filename_cat($phinddir, "pdata");
440
441	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
442	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
443	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
444
445	# create the mg index of words
446	print $out "\nCreating word-level search indexes\n";
447	$mg_input = &util::filename_cat($phinddir, "pword.txt");
448	$mg_stem = &util::filename_cat($phinddir, "pword");
449
450	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
451	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
452	&execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
453	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
454	&execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
455	&execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
456
457	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
458	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
459	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
460
461	# create the mg document information database
462	print $out "\nCreating document information databases\n";
463	$mg_input = &util::filename_cat($phinddir, "docs.txt");
464	$mg_stem = &util::filename_cat($phinddir, "docs");
465
466	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
467	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
468	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
469
470	# Return the information about the classifier that we'll later want to
471	# use to create macros when the Phind classifier document is displayed.
472	my %classifyinfo = ('thistype'=>'Invisible',
473	'childtype'=>'Phind',
474	'Title'=>$self->{'buttonname'},
475	'parameters'=>"phindnumber=$self->{'phindnumber'}",
476	'contains'=>[]);
477
478	my $collection = $self->{'collection'};
479	my $url = "library?a=p&p=phind&c=$collection";
480	push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
481
482	return \%classifyinfo;
483	}
484
485
486
487	sub convert_gml_to_tokens {
488
489	my ($language_exp, $text) = @_;
490
491	# escape any magic words... - jrm21
492	foreach my $delim (@delimiters) {
493	my $replacement=lc($delim);
494	my $num= $text=~ s/$delim/$replacement/g;
495	if (!$num) {$num=0;}
496	}
497
498	if ($language_exp =~ /en/) {
499	return &convert_gml_to_tokens_EN($text);
500	}
501
502	if ($language_exp =~ /zh/) {
503	return &convert_gml_to_tokens_ZH($text);
504	}
505
506	$_ = $text;
507
508	# 1. remove GML tags
509
510	# Remove everything that is in a tag
511	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
512	s/\s<br>\s/ LINEBREAK /isgo;
513	s/<[^>]*>/ /sgo;
514
515	# Now we have the text, but it may contain HTML
516	# elements coded as > etc. Remove these tags.
517	s/&/&/sgo;
518	s/</</sgo;
519	s/>/>/sgo;
520	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
521	s/\s<br>\s/ LINEBREAK /isgo;
522	s/<[^>]*>/ /sgo;
523
524	# replace<p> and <br> placeholders with clause break symbol (\n)
525	s/\s+/ /gso;
526	s/PARAGRAPHBREAK/\n/sgo;
527	s/LINEBREAK/\n/sgo;
528
529
530
531
532	# 2. Split the remaining text into space-delimited tokens
533
534	# Convert any HTML special characters (like ") to their UTF8 equivalent
535	s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
536
537	# Split text at word boundaries
538	s/\b/ /go;
539
540	# 3. Convert the remaining text to "clause format"
541
542	# Insert newline if the end of a sentence is detected
543	# (delimter is: "[\.\?\!]\s")
544	# s/\s*[\.\?\!]\s+/\n/go;
545
546	# remove unnecessary punctuation and replace with clause break symbol (\n)
547	s/[^\w ]/\n/go;
548
549	# remove extraneous whitespace
550	s/ +/ /sgo;
551	s/^\s+//mgo;
552	s/\s*$/\n/mgo;
553
554	# remove lines that contain one word or less
555	s/^\S*$//mgo;
556	s/^\s*$//mgo;
557	tr/\n//s;
558
559	return $_;
560	}
561
562	# a chinese version
563	sub convert_gml_to_tokens_ZH {
564
565	$_ = shift @_;
566
567	# Replace all whitespace with a simple space
568	s/\s+/ /gs;
569	# Remove everything that is in a tag
570	s/\s<p>\s/ PARAGRAPHBREAK /isg;
571	s/\s<br>\s/ LINEBREAK /isg;
572	s/<[^>]*>/ /sg;
573
574	# Now we have the text, but it may contain HTML
575	# elements coded as > etc. Remove these tags.
576	s/</</sg;
577	s/>/>/sg;
578
579	s/\s+/ /sg;
580	s/\s<p>\s/ PARAGRAPHBREAK /isg;
581	s/\s<br>\s/ LINEBREAK /isg;
582	s/<[^>]*>/ /sg;
583
584	# remove & and other miscellaneous markup tags
585	s/&/&/sg;
586	s/</</sg;
587	s/>/>/sg;
588	s/&/&/sg;
589
590	# replace<p> and <br> placeholders with carriage returns
591	s/PARAGRAPHBREAK/\n/sg;
592	s/LINEBREAK/\n/sg;
593
594
595	# print STDERR "text:$_\n";
596	return $_;
597	}
598
599	# A version of convert_gml_to_tokens that is fine-tuned to the English language.
600
601	sub convert_gml_to_tokens_EN {
602	$_ = shift @_;
603
604	# FIRST, remove GML tags
605
606	# Replace all whitespace with a simple space
607	s/\s+/ /gs;
608
609	# Remove everything that is in a tag
610	s/\s<p>\s/ PARAGRAPHBREAK /isg;
611	s/\s<br>\s/ LINEBREAK /isg;
612	s/<[^>]*>/ /sg;
613
614	# Now we have the text, but it may contain HTML
615	# elements coded as > etc. Remove these tags.
616	s/</</sg;
617	s/>/>/sg;
618
619	s/\s+/ /sg;
620	s/\s<p>\s/ PARAGRAPHBREAK /isg;
621	s/\s<br>\s/ LINEBREAK /isg;
622	s/<[^>]*>/ /sg;
623
624	# remove & and other miscellaneous markup tags
625	s/&/&/sg;
626	s/</</sg;
627	s/>/>/sg;
628	s/&/&/sg;
629
630	# replace<p> and <br> placeholders with carriage returns
631	s/PARAGRAPHBREAK/\n/sg;
632	s/LINEBREAK/\n/sg;
633
634
635	# Exceptional punctuation
636	#
637	# We make special cases of some punctuation
638
639	# remove any apostrophe that indicates omitted letters
640	s/(\w+)\'(\w*\s)/ $1$2 /g;
641
642	# remove period that appears in a person's initals
643	s/\s([A-Z])\./ $1 /g;
644
645	# replace hyphens in hypheanted words and names with a space
646	s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
647
648	# Convert the remaining text to "clause format",
649	# This means removing all excess punctuation and garbage text,
650	# normalising valid punctuation to fullstops and commas,
651	# then putting one cluse on each line.
652
653	# Insert newline when the end of a sentence is detected
654	# (delimter is: "[\.\?\!]\s")
655	s/\s*[\.\?\!]\s+/\n/g;
656
657	# split numbers after four digits
658	s/(\d\d\d\d)/$1 /g;
659
660	# split words after 32 characters
661
662	# squash repeated punctuation
663	tr/A-Za-z0-9 //cs;
664
665	# save email addresses
666	# s/\w+@\w+\.[\w\.]+/EMAIL/g;
667
668	# normalise clause breaks (mostly punctuation symbols) to commas
669	s/[^A-Za-z0-9 \n]+/ , /g;
670
671	# Remove repeated commas, and replace with newline
672	s/\s*,[, ]+/\n/g;
673
674	# remove extra whitespace
675	s/ +/ /sg;
676	s/^\s+//mg;
677	s/\s*$/\n/mg;
678
679	# remove lines that contain one word or less
680	s/^\w*$//mg;
681	s/^\s*$//mg;
682	tr/\n//s;
683
684	return $_;
685
686	}
687
688
689
690	# Execute a system command
691
692	sub execute {
693	my ($command, $verbosity, $outhandle) = @_;
694	print $outhandle "Executing: $command\n" if ($verbosity > 2);
695	$! = 0;
696	my $status = system($command);
697	if ($status != 0) {
698	print STDERR "Phind - Error executing '$command': $!\n";
699	exit($status); # this causes the build to fail...
700	}
701	}
702
703
704	# Generate the vocabulary, symbol statistics, and numbers file from the
705	# clauses file. This is legacy code, so is a bit messy and probably wont
706	# run under windows.
707
708	sub extract_vocabulary {
709	my ($self) = @_;
710
711	my $verbosity = $self->{'verbosity'};
712	my $out = $self->{'outhandle'};
713
714	my $collectiondir = $self->{'collectiondir'};
715	my $phinddir = $self->{'phinddir'};
716
717	my $language_exp = $self->{'language_exp'};
718
719	my ($w, $l, $line, $word);
720
721	my ($first_delimiter, $last_delimiter,
722	$first_stopword, $last_stopword,
723	$first_extractword, $last_extractword,
724	$first_contentword, $last_contentword,
725	$phrasedelimiter);
726
727	my $thesaurus = $self->{'thesaurus'};
728	my ($thesaurus_links, $thesaurus_terms,
729	%thesaurus, $first_thesaurusword, $last_thesaurusword);
730
731	my %symbol;
732	my (%freq);
733
734	print $out "Calculating vocabulary\n" if ($verbosity > 1);
735
736	# Read and store the stopwords
737	my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
738	my $stopword_files = ();
739	my ($language, $language_dir, $file, $file_name);
740	my %stopwords;
741
742	# Examine each directory in the stopword directory
743	opendir(STOPDIR, $stopdir);
744	foreach $language (readdir STOPDIR) {
745
746	# Ignore entries that do not match the classifier's language
747	next unless ($language =~ /$language_exp/);
748	$language_dir = &util::filename_cat($stopdir, $language);
749	next unless (-d "$language_dir");
750
751	opendir(LANGDIR, $language_dir);
752	foreach $file (readdir LANGDIR) {
753
754	# Ignore entries that are not stopword files
755	next unless ($file =~ /sw$/);
756	$file_name = &util::filename_cat($language_dir, $file);
757	next unless (-f "$file_name");
758
759	# Read the stopwords
760	open(STOPFILE, "<$file_name");
761	while (<STOPFILE>) {
762	s/^\s+//;
763	s/\s.*//;
764	$word = $_;
765	$l = lc($word);
766	$stopwords{$l} = $word;
767	}
768	close STOPFILE;
769
770	}
771	closedir LANGDIR;
772	}
773	closedir STOPDIR;
774
775	# Read thesaurus information
776	if ($thesaurus) {
777
778	# link file exists
779	$thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
780	die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
781
782	# ensure term file exists in the correct language
783	if ($language_exp =~ /^([a-z][a-z])/) {
784	$language = $1;
785	} else {
786	$language = 'en';
787	}
788	$thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
789	die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
790
791
792	# Read the thesaurus terms
793	open(TH, "<$thesaurus_terms");
794	while(<TH>) {
795	s/^\d+ //;
796	s/\(.*\)//;
797	foreach $w (split(/\s+/, $_)) {
798	$thesaurus{lc($w)} = $w;
799	}
800	}
801	close TH;
802	}
803
804	# Read words in the text and count occurences
805	open(TXT, "<$phinddir/clauses");
806
807	my @words;
808	while(<TXT>) {
809	$line = $_;
810	next unless ($line =~ /./);
811
812	@words = split(/\s+/, $line);
813	foreach $w (@words) {
814	$l = lc($w);
815	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
816	$freq{$w}++;
817	}
818	$freq{$senlimit}++;
819	}
820
821	close TXT;
822
823	# Calculate the "best" form of each word
824	my (%bestform, %totalfreq, %bestfreq);
825
826	foreach $w (sort (keys %freq)) {
827	$l = lc($w);
828
829	# totalfreq is the number of times a term appears in any form
830	$totalfreq{$l} += $freq{$w};
831
832	if (defined $stopwords{$l}) {
833	$bestform{$l} = $stopwords{$l};
834
835	} elsif (defined $thesaurus{$l}) {
836	$bestform{$l} = $thesaurus{$l};
837
838	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
839	$bestfreq{$l} = $freq{$w};
840	$bestform{$l} = $w;
841	}
842	}
843	undef %freq;
844	undef %bestfreq;
845
846
847	# Assign symbol numbers to tokens
848	my $nextsymbol = 1;
849	my (@vocab);
850
851	# Delimiters
852	$first_delimiter = 1;
853
854	foreach $word (@delimiters) {
855
856	# $word = lc($word); # jrm21
857	$word = uc($word);
858	$bestform{$word} = $word;
859	$vocab[$nextsymbol] = $word;
860	$symbol{$word} = $nextsymbol;
861	$nextsymbol++;
862	}
863	$last_delimiter = $nextsymbol - 1;
864	# Stopwords
865	$first_stopword = $nextsymbol;
866
867	foreach my $word (sort keys %stopwords) {
868	# don't include stopword unless it occurs in the text
869	$word = lc($word);
870	next unless ($totalfreq{$word});
871	next if ($symbol{$word});
872
873	$vocab[$nextsymbol] = $word;
874	$symbol{$word} = $nextsymbol;
875	$nextsymbol++;
876	}
877	$last_stopword = $nextsymbol - 1;
878	$first_contentword = $nextsymbol;
879
880	# Thesaurus terms
881	if ($thesaurus) {
882	$first_thesaurusword = $nextsymbol;
883
884	foreach my $word (sort keys %thesaurus) {
885
886	$word = lc($word);
887	next if ($symbol{$word});
888	$bestform{$word} = $thesaurus{$word};
889
890	$vocab[$nextsymbol] = $word;
891	$symbol{$word} = $nextsymbol;
892	$nextsymbol++;
893
894	}
895	$last_thesaurusword = $nextsymbol - 1;
896	}
897
898	# Other content words
899	$first_extractword = $nextsymbol;
900
901	foreach my $word (sort (keys %bestform)) {
902
903	next if ($symbol{$word});
904
905	$vocab[$nextsymbol] = $word;
906	$symbol{$word} = $nextsymbol;
907	$nextsymbol++;
908	}
909	$last_extractword = $nextsymbol - 1;
910	$last_contentword = $nextsymbol - 1;
911
912	# Outut the words
913	print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
914	open(VOC, ">$phinddir/clauses.vocab");
915
916	for (my $i = 1; $i < $nextsymbol; $i++) {
917	$w = $vocab[$i];
918
919	print VOC "$bestform{$w}\n";
920	$totalfreq{$w} = 0 unless ($totalfreq{$w});
921	}
922	close VOC;
923
924
925	# Create statistics file
926	# Output statistics about the vocablary
927	print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
928	&util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
929
930	open(STAT, ">$phinddir/clauses.stats")
931	\|\| die "Cannot open $phinddir/clauses.stats: $!";
932
933	print STAT "first_delimiter $first_delimiter\n";
934	print STAT "last_delimiter $last_delimiter\n";
935	print STAT "first_stopword $first_stopword\n";
936	print STAT "last_stopword $last_stopword\n";
937	if ($thesaurus) {
938	print STAT "first_thesaurusword $first_thesaurusword\n";
939	print STAT "last_thesaurusword $last_thesaurusword\n";
940	}
941	print STAT "first_extractword $first_extractword\n";
942	print STAT "last_extractword $last_extractword\n";
943	print STAT "first_contentword $first_contentword\n";
944	print STAT "last_contentword $last_contentword\n";
945	print STAT "first_symbol $first_delimiter\n";
946	print STAT "last_symbol $last_contentword\n";
947	print STAT "first_word $first_stopword\n";
948	print STAT "last_word $last_contentword\n";
949	close STAT;
950
951	undef @vocab;
952
953
954	# Create numbers file
955	# Save text as symbol numbers
956	print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
957
958	open(TXT, "<$phinddir/clauses");
959	open(NUM, ">$phinddir/clauses.numbers");
960
961	## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
962	## print NUM "$symbol{lc($colstart)}\n"; # jrm21
963	$phrasedelimiter = $symbol{$senlimit};
964	print NUM "$symbol{$colstart}\n";
965
966	# set up the special symbols that delimit documents and sentences
967	while(<TXT>) {
968
969	# split sentence into a list of tokens
970	$line = $_;
971	next unless ($line =~ /./);
972	@words = split(/\s+/, $line);
973
974	# output one token at a time
975	foreach $word (@words) {
976	# don't lower-case special delimiters - jrm21
977	if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
978	$word = lc($word);
979	}
980	print NUM "$symbol{$word}\n";
981	}
982
983	# output phrase delimiter
984	print NUM "$phrasedelimiter\n";
985	}
986
987	close TXT;
988	# print NUM "$symbol{lc($colend)}\n";# jrm21
989	print NUM "$symbol{$colend}\n";
990	close NUM;
991
992	# Save thesaurus data in one convienient file
993	if ($thesaurus) {
994
995	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
996
997
998	print $out "Saving thesaurus as numbers in $thesaurusfile\n"
999	if ($verbosity > 1);
1000
1001	# Read the thesaurus terms
1002	my ($num, $text, %thes_symbols);
1003
1004	open(TH, "<$thesaurus_terms");
1005	while(<TH>) {
1006	chomp;
1007	@words = split(/\s+/, $_);
1008	$num = shift @words;
1009	$text = "";
1010
1011	# translate words into symbol numbers
1012	foreach $word (@words) {
1013	$word = lc($word);
1014	if ($symbol{$word}) {
1015	$text .= "s$symbol{$word} ";
1016	} elsif ($verbosity) {
1017	print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
1018	}
1019	}
1020	$text =~ s/ $//;
1021	$thes_symbols{$num} = $text;
1022	}
1023	close TH;
1024
1025	# Read the thesaurus links and write the corresponding data
1026	open(TH, "<$thesaurus_links");
1027	open(THOUT, ">$thesaurusfile");
1028
1029	while(<TH>) {
1030	chomp;
1031	($num, $text) = split(/:/, $_);
1032
1033	if (defined($thes_symbols{$num})) {
1034	print THOUT "$num:$thes_symbols{$num}:$text\n";
1035	} else {
1036	print THOUT "$num:untranslated:$text\n";
1037	}
1038	}
1039	close TH;
1040	close THOUT;
1041	}
1042
1043
1044
1045
1046	}
1047
1048
1049	# renumber_phrases
1050	#
1051	# Prepare the phrases file to be input to mgpp. The biggest problem is
1052	# reconciling the phrase identifiers used by the suffix program (which
1053	# we'll call suffix-id numbers) with the numbers used in the thesaurus
1054	# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
1055	# Phind-id numbers must be sorted by frequency of occurance.
1056	#
1057	# Start creating a set of phind-id numbers from the sorted suffix-id
1058	# numbers and (if required) the thesaurus-id numbers. Then add any other
1059	# phrases occuring in the thesaurus.
1060	#
1061	# The last thing we have to do is restore the vocabulary information to the
1062	# phrase file so that the phrases are stored as words, not as symbol
1063	# numbers.
1064
1065	# The original phrases file looks something like this:
1066	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1067	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1068	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1069	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1070
1071
1072	sub renumber_phrases {
1073	my ($self) = @_;
1074
1075	renumber_suffix_data($self);
1076	renumber_thesaurus_data($self);
1077	restore_vocabulary_data($self);
1078
1079	}
1080
1081
1082
1083	# renumber_suffix_data
1084	#
1085	# Translate phrases file to phrases.2 using phind keys instead
1086	# of suffix keys and sorting the expansion data.
1087
1088	sub renumber_suffix_data {
1089	my ($self) = @_;
1090
1091	my $verbosity = $self->{'verbosity'};
1092	my $out = $self->{'outhandle'};
1093	print $out "Translate phrases: suffix-ids become phind-id's\n"
1094	if ($verbosity);
1095
1096	my $phinddir = $self->{'phinddir'};
1097	my $infile = &util::filename_cat($phinddir, 'phrases');
1098	my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1099
1100	# Read the phrase file. Calculate initial set of phind-id
1101	# numbers and store (suffixid -> frequency) relation.
1102
1103	my %suffixtophind;
1104	my @phindfrequency;
1105	my (@fields, $suffixid);
1106	my $nextphind = 1;
1107
1108	open(IN, "<$infile");
1109	while(<IN>) {
1110
1111	chomp;
1112	@fields = split(/:/, $_);
1113
1114	# get next suffixid and phindid
1115	$suffixid = shift @fields;
1116	$suffixtophind{$suffixid} = $nextphind;
1117
1118	# store total frequency
1119	shift @fields;
1120	$totalfrequency[$nextphind] = shift @fields;
1121
1122	$nextphind++;
1123	}
1124	close IN;
1125
1126
1127	# Translate phrases file to phrases.2. Use phind keys (not suffix
1128	# keys), sort expansion and document occurance data in order of
1129	# descending frequency..
1130	open(IN, "<$infile");
1131	open(OUT, ">$outfile");
1132
1133	my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1134	my (@documwents, @newexp, $k, $n);
1135	my $linenumber = 0;
1136
1137	while(<IN>) {
1138
1139	# read the line
1140	chomp;
1141	@fields = split(/:/, $_);
1142
1143	# get a phrase number for this line
1144	$suffixid = shift @fields;
1145	die unless (defined($suffixtophind{$suffixid}));
1146	$phindid = $suffixtophind{$suffixid};
1147
1148	# get the symbols in the phrase
1149	$text = shift @fields;
1150
1151	# output status information
1152	$linenumber++;
1153	if ($verbosity > 2) {
1154	if ($linenumber % 1000 == 0) {
1155	print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1156	}
1157	print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1158	}
1159
1160	# get the phrase frequency
1161	$tf = shift @fields;
1162
1163	# get the number of expansions
1164	$countexp = shift @fields;
1165
1166	# get the expansions, convert them into phind-id numbers, and sort them
1167	$expansions = shift @fields;
1168	@newexp = ();
1169	foreach $k (split(/,/, $expansions)) {
1170	die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1171	$n = $suffixtophind{$k};
1172	push @newexp, $n;
1173	}
1174	@newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1175
1176	# get the number of documents
1177	$countdocs = shift @fields;
1178
1179	# get the documents and sort them
1180	$documents = shift @fields;
1181	$documents =~ s/d//g;
1182	@documents = split(/;/, $documents);
1183	@documents = sort by_doc_frequency @documents;
1184
1185	# output the phrase data
1186	print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1187	print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1188
1189	}
1190
1191	close IN;
1192	close OUT;
1193	}
1194
1195
1196	# renumber_thesaurus_data
1197	#
1198	# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1199
1200	sub renumber_thesaurus_data {
1201	my ($self) = @_;
1202
1203	my $out = $self->{'outhandle'};
1204	my $verbosity = $self->{'verbosity'};
1205	my $thesaurus = $self->{'thesaurus'};
1206
1207	my $phinddir = $self->{'phinddir'};
1208	my $infile = &util::filename_cat($phinddir, "phrases.2");
1209	my $outfile = &util::filename_cat($phinddir, "phrases.3");
1210
1211
1212	# If no thesaurus is defined, simply move the phrases file.
1213	if (!$thesaurus) {
1214	print $out "Translate phrases.2: no thesaurus data\n"
1215	if ($verbosity);
1216	&util::mv($infile, $outfile);
1217	return;
1218	}
1219
1220	print $out "Translate phrases.2: add thesaurus data\n"
1221	if ($verbosity);
1222
1223	# 1.
1224	# Read thesaurus file and store (symbols->thesaurusid) mapping
1225	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1226	my %symbolstothesid;
1227	my (@fields, $thesid, $symbols);
1228
1229	open(TH, "<$thesaurusfile");
1230
1231	while (<TH>) {
1232
1233	chomp;
1234	@fields = split(/:/, $_);
1235
1236	# get id and text
1237	$thesid = shift @fields;
1238	$symbols = shift @fields;
1239	$symbolstothesid{$symbols} = $thesid;
1240	}
1241	close TH;
1242
1243	# 2.
1244	# Read phrases file to find thesaurus entries that already
1245	# have a phindid. Store their phind-ids for later translation,
1246	# and store their frequency for later sorting.
1247	my %thesaurustophindid;
1248	my %phindidtofrequency;
1249	my ($phindid, $freq);
1250
1251	open(IN, "<$infile");
1252
1253	while(<IN>) {
1254
1255	chomp;
1256	@fields = split(/:/, $_);
1257
1258	# phindid and symbols for this line
1259	$phindid = shift @fields;
1260	$symbols = shift @fields;
1261	$freq = shift @fields;
1262
1263	# do we have a thesaurus id corresponding to this phrase?
1264	if (defined($symbolstothesid{$symbols})) {
1265	$thesid = $symbolstothesid{$symbols};
1266	$thesaurustophindid{$thesid} = $phindid;
1267	$phindidtofrequency{$phindid} = $freq;
1268	}
1269	}
1270	close IN;
1271
1272	undef %symbolstothesid;
1273
1274	# 3.
1275	# Create phind-id numbers for remaining thesaurus entries,
1276	# and note that their frequency is 0 for later sorting.
1277	my $nextphindid = $phindid + 1;
1278
1279	open(TH, "<$thesaurusfile");
1280	while(<TH>) {
1281
1282	chomp;
1283	@fields = split(/:/, $_);
1284
1285	# read thesaurus-id and ensure it has a corresponding phind-id
1286	$thesid = shift @fields;
1287	if (!defined($thesaurustophindid{$thesid})) {
1288	$thesaurustophindid{$thesid} = $nextphindid;
1289	$phindidtofrequency{$nextphindid} = 0;
1290	$nextphindid++;
1291	}
1292	}
1293	close TH;
1294
1295	# 4.
1296	# Translate thesaurus file, replacing thesaurus-id numbers with
1297	# phind-id numbers.
1298	my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1299	my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1300	my (@links, $linkid, %linkidtotype, $newrelation);
1301
1302	open(TH, "<$thesaurusfile");
1303	open(TO, ">$newthesaurusfile");
1304	while(<TH>) {
1305
1306	chomp;
1307	@fields = split(/:/, $_);
1308
1309	# phindid and symbols for this line
1310	($thesid, $symbols, $relations) = @fields;
1311
1312	die unless ($thesid && $symbols);
1313	die unless $thesaurustophindid{$thesid};
1314	$phindid = $thesaurustophindid{$thesid};
1315
1316	# convert each part of the relation string to use phind-id numbers
1317	# at the same time, we want to sort the list by frequency.
1318	undef %linkidtotype;
1319
1320	foreach $linktext (split(/;/, $relations)) {
1321	@linkdata = split(/,/, $linktext);
1322
1323	# remember the linktype (e.g. BT, NT)
1324	$linktype = shift @linkdata;
1325
1326	# store the type of each link
1327	foreach $thesid (@linkdata) {
1328	die unless (defined($thesaurustophindid{$thesid}));
1329	$linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1330	}
1331	}
1332
1333	# sort the list of links, first by frequency, then by type.
1334	@links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1335	or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1336	$linkcounter = (scalar @links);
1337
1338	# create a string describing the link information
1339	$linktype = $linkidtotype{$links[0]};
1340	$newrelation = $linktype;
1341	foreach $linkid (@links) {
1342	if ($linkidtotype{$linkid} ne $linktype) {
1343	$linktype = $linkidtotype{$linkid};
1344	$newrelation .= ";" . $linktype;
1345	}
1346	$newrelation .= "," . $linkid;
1347	}
1348	$newrelation .= ";";
1349
1350
1351	# output the new line
1352	print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1353	}
1354	close TH;
1355	close TO;
1356
1357	undef %thesaurustophindid;
1358	undef %linkidtotype;
1359	undef %phindidtofrequency;
1360
1361	# 5.
1362	# Read thesaurus data (in phind-id format) into memory
1363	my %thesaurusdata;
1364
1365	open(TH, "<$newthesaurusfile");
1366	while(<TH>) {
1367	chomp;
1368	($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1369	die unless ($phindid && $symbols);
1370	$thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1371	}
1372	close TH;
1373
1374	# 6.
1375	# Add thesaurus data to phrases file
1376	my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1377	my (@documwents, @newexp, $k, $n);
1378	my $linenumber = 0;
1379
1380	open(IN, "<$infile");
1381	open(OUT, ">$outfile");
1382
1383	# Update existing phrases
1384	while(<IN>) {
1385
1386	chomp;
1387	@fields = split(/:/, $_);
1388
1389	# get data for this line
1390	$phindid = shift @fields;
1391
1392	# output the phrase data, with thesaurus information
1393	print OUT "$phindid:", join(":", @fields);
1394
1395	# add thesaurus data
1396	if (defined($thesaurusdata{$phindid})) {
1397	@fields = split(/:/, $thesaurusdata{$phindid});
1398	shift @fields;
1399	$linkcounter = shift @fields;
1400	$relations = shift @fields;
1401
1402	print OUT ":$linkcounter:$relations";
1403	$thesaurusdata{$phindid} = "";
1404	}
1405	print OUT "\n";
1406	}
1407	close IN;
1408
1409	# Add phrases that aren't already in the file
1410	foreach $phindid (sort numerically keys %thesaurusdata) {
1411	next unless ($thesaurusdata{$phindid});
1412
1413	@fields = split(/:/, $thesaurusdata{$phindid});
1414	$symbols = shift @fields;
1415	$linkcounter = shift @fields;
1416	$relations = shift @fields;
1417
1418	print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1419	}
1420	close OUT;
1421
1422	}
1423
1424	# restore_vocabulary_data
1425	#
1426	# Read phrases.3 and restore vocabulary information. Then write
1427	# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1428	# (if requested) to the saved phrases file.
1429
1430	sub restore_vocabulary_data {
1431	my ($self) = @_;
1432
1433	my $out = $self->{'outhandle'};
1434	my $verbosity = $self->{'verbosity'};
1435	print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1436
1437	my $phinddir = $self->{'phinddir'};
1438	my $infile = &util::filename_cat($phinddir, 'phrases.3');
1439	my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1440	my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1441	my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1442
1443	my $savephrases = $self->{'savephrases'};
1444
1445	# 1.
1446	# Read the vocabulary file
1447	open(V, "<$vocabfile")
1448	\|\| die "Cannot open $vocabfile: $!";
1449	my @symbol;
1450	my $i = 1;
1451	while(<V>) {
1452	chomp;
1453	$symbol[$i++] = $_;
1454	}
1455	close V;
1456
1457	# 2.
1458	# Translate phrases.3 to MGPP input files
1459	my ($key, $text, $word, $isThesaurus);
1460	my @fields;
1461	my $linenumber = 0;
1462
1463	open(IN, "<$infile");
1464	open(DATA, ">$datafile");
1465	open(WORD, ">$wordfile");
1466
1467	# Save the phrases in a separate text file
1468	if ($savephrases) {
1469	print $out "Saving phrases in $savephrases\n" if ($verbosity);
1470	open(SAVE, ">$savephrases");
1471	}
1472
1473	while(<IN>) {
1474
1475	# read the line
1476	chomp;
1477	$line = $_;
1478	@fields = split(/:/, $line);
1479
1480	# get a phrase number for this line
1481	$key = shift @fields;
1482
1483	# restore the text of the phrase
1484	$text = shift @fields;
1485	$text =~ s/s(\d+)/$symbol[$1]/g;
1486	if ($text =~ / /) {
1487	$word = "";
1488	} elsif ($text ne 'untranslated') {
1489	$word = $text;
1490	}
1491
1492	# output the phrase data
1493	print DATA "<Document>";
1494	print DATA "$key:$text:", join(":", @fields), ":\n";
1495
1496	# output the word index search data
1497	print WORD "<Document>$word\n";
1498
1499	# output the phrases to a text file
1500	if ($savephrases) {
1501	if ((scalar @fields) == 7) {
1502	$isThesaurus = 1;
1503	} else {
1504	$isThesaurus = 0;
1505	}
1506	print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1507	}
1508	}
1509	close IN;
1510	close WORD;
1511	close DATA;
1512	close SAVE if ($savephrases);
1513
1514	}
1515
1516
1517
1518	# sort routines used to renumber phrases
1519
1520	sub numerically { $a <=> $b }
1521
1522	sub by_doc_frequency {
1523	my $fa = 1;
1524	if ($a =~ /,/) {
1525	$fa = $a;
1526	$fa =~ s/\d+,//;
1527	}
1528	my $fb = 1;
1529	if ($b =~ /,/) {
1530	$fb = $b;
1531	$fb =~ s/\d+,//;
1532	}
1533
1534	return ($fb <=> $fa);
1535	}
1536
1537	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: