source: trunk/gsdl/perllib/classify/Phind.pm@ 8362

Last change on this file since 8362 was 8362, checked in by kjdon, 20 years ago

Added a new option to the Phind classifier: min_occurs. This is the minimum phrase frequency needed for a phrase to be included in the hierarchy.

  • Property svn:keywords set to Author Date Id Revision
File size: 40.5 KB
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32# 12/05/02 Added usage datastructure - John Thompson
33
34package Phind;
35
36use BasClas;
37use util;
38use ghtml;
39use unicode;
40
41my @removedirs = ();
42
43my %wanted_index_files = ('td'=>1,
44 't'=>1,
45 'ti'=>1,
46 'tl'=>1,
47 'tsd'=>1,
48 'idb'=>1,
49 'ib1'=>1,
50 'ib2'=>1,
51 'ib3'=>1,
52 'i'=>1,
53 'il'=>1,
54 'w'=>1,
55 'wa'=>1);
56
57sub BEGIN {
58 @ISA = ('BasClas');
59}
60
61sub END {
62
63 # Tidy up stray files - we do this here as there's some weird problem
64 # preventing us from doing it in the get_classify_info() function (on
65 # windows at least) where the close() appears to fail on txthandle and
66 # dochandle, thus preventing us from deleting those files
67
68 foreach my $dir (@removedirs) {
69 if (-d $dir && opendir (DIR, $dir)) {
70 my @files = readdir DIR;
71 closedir DIR;
72
73 foreach $file (@files) {
74 next if $file =~ /^\.\.?$/;
75 my ($suffix) = $file =~ /\.([^\.]+)$/;
76 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
77 # delete it!
78 &util::rm (&util::filename_cat ($dir, $file));
79 }
80 }
81 }
82 }
83}
84
85my $arguments =
86 [ { 'name' => "text",
87 'desc' => "{Phind.text}",
88 'type' => "string",
89 'deft' => "section:Title,section:text",
90 'reqd' => "no" },
91 { 'name' => "title",
92 'desc' => "{Phind.title}",
93 'type' => "metadata",
94 'deft' => "Title",
95 'reqd' => "no" },
96 { 'name' => "buttonname",
97 'desc' => "{BasClas.buttonname}",
98 'type' => "string",
99 'deft' => "Phrase",
100 'reqd' => "no" },
101 { 'name' => "language",
102 'desc' => "{Phind.language}",
103 'type' => "string",
104 'deft' => "en",
105 'reqd' => "no" },
106 { 'name' => "savephrases",
107 'desc' => "{Phind.savephrases}",
108 'type' => "string",
109 'deft' => "",
110 'reqd' => "no" },
111 { 'name' => "suffixmode",
112 'desc' => "{Phind.suffixmode}",
113 'type' => "int",
114 'deft' => "1",
115 'range' => "0,1",
116 'reqd' => "no" },
117 { 'name' => "min_occurs",
118 'desc' => "{Phind.min_occurs}",
119 'type' => "int",
120 'deft' => "2",
121 'range' => "1,",
122 'reqd' => "no" },
123 { 'name' => "thesaurus",
124 'desc' => "{Phind.thesaurus}",
125 'type' => "string",
126 'deft' => "",
127 'reqd' => "no" },
128 { 'name' => "untidy",
129 'desc' => "{Phind.untidy}",
130 'type' => "flag",
131 'reqd' => "no" } ];
132
133my $options = { 'name' => "Phind",
134 'desc' => "{Phind.desc}",
135 'abstract' => "no",
136 'inherits' => "yes",
137 'args' => $arguments };
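# For reference, these options are normally set from the collection's
# collect.cfg file. A minimal sketch of such an entry (illustrative only --
# the option values shown here are examples, not requirements):
#
#   classify Phind -text section:Title,section:text -title Title
#            -buttonname Phrase -min_occurs 2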
138
139
140# Phrase delimiter symbols - these should be abstracted out someplace
141
142my $colstart = "COLLECTIONSTART";
143my $colend = "COLLECTIONEND";
144my $doclimit = "DOCUMENTLIMIT";
145my $senlimit = "SENTENCELIMIT";
146my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
147
148
149# Create a new Phind browser based on collect.cfg
150
151sub new {
152 my $class = shift (@_);
153 my $self = new BasClas($class, @_);
154
155 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
156 my $option_list = $self->{'option_list'};
157 push( @{$option_list}, $options );
158
159 if ($self->{'info_only'}) {
160 # created from classinfo.pl - don't need to parse the arguments
161 return bless $self, $class;
162 }
163
164 my $out = $self->{'outhandle'};
165
166 # Ensure the Phind generate scripts are in place
167 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
168 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
169 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
170
171 if (!(-e $file1)) {
172 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
173 exit(1);
174 }
175
176 # Parse classifier arguments
177 my $builddir = "";
178 if (!parsargv::parse(\@_,
179 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
180 q^title/.*/Title^, \$self->{'titlefield'},
181 q^buttonname/.*/Phrase^, \$self->{'buttonname'},
182 q^language/.*/en^, \$language,
183 q^savephrases/.*/^, \$self->{'savephrases'},
184 q^suffixmode/\d/1^, \$self->{'suffixmode'},
185 q^min_occurs/\d/2^, \$self->{'min_occurs'},
186 q^thesaurus/.*/^, \$self->{'thesaurus'},
187 q^untidy^, \$self->{'untidy'},
188 "allow_extra_options")) {
189
190 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
191 $self->print_txt_usage(""); # Use default resource bundle
192 die "\n";
193 }
194
195 # classifier information
196 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
197
198 # limit languages
199 $self->{'language_exp'} = $language;
200
201 # collection directories
202 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
203
204 if (! defined $self->{'builddir'}) {
205 $self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
206 }
207
208 $self->{'total'} = 0;
209
210 return bless $self, $class;
211}
212
213
214# Initialise the Phind classifier
215
216sub init {
217 my $self = shift (@_);
218
219 # ensure we have a build directory
220 my $builddir = $self->{'builddir'};
221 die unless (-e "$builddir");
222
223 # create Phind directory
224 my $phnumber = 1;
225 my $phinddir = &util::filename_cat($builddir, "phind1");
226 while (-e "$phinddir") {
227 $phnumber++;
228 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
229 }
230 &util::mk_dir("$phinddir");
231 $self->{'phinddir'} = $phinddir;
232 $self->{'phindnumber'} = $phnumber;
233
234 push(@removedirs, $phinddir) unless $self->{'untidy'};
235
236 # open filehandles for documents and text
237 my $clausefile = &util::filename_cat("$phinddir", "clauses");
238 &util::rm($clausefile) if (-e $clausefile);
239
240 my $txthandle = 'TEXT' . $phnumber;
241 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
242 $self->{'txthandle'} = $txthandle;
243
244 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
245 &util::rm($docfile) if (-e $docfile);
246
247 my $dochandle = 'DOC' . $phnumber;
248 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
249 $self->{'dochandle'} = $dochandle;
250
251}
252
253
254# Classify each document.
255#
256# Each document is passed here in turn. The classifier extracts the
257# text of each and stores it in the clauses file. Document details are
258# stored in the docs.txt file.
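# For illustration (the OID and title below are made up), each call appends
# one line of the form
#   <Document>\tHASH0158f56086efffe592636058\tSome Document Title
# to docs.txt, and appends the document's extracted text to the clauses file,
# preceded by a DOCUMENTLIMIT line marking the start of that document.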
259
260sub classify {
261 my $self = shift (@_);
262 my $doc_obj = shift @_;
263
264 my $verbosity = $self->{'verbosity'};
265 my $top_section = $doc_obj->get_top_section();
266
267 my $titlefield = $self->{'titlefield'};
268
269 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
270 if (!defined($title)) {
271 $title = "";
272 print STDERR "Phind: document has no title\n";
273 }
274 print "process: $title\n" if ($verbosity > 2);
275
276 # Only consider the file if it is in the correct language
277 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
278 my $phrlanguage = $self->{'language_exp'};
279 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
280
281 # record this file
282 $self->{'total'} ++;
283 print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
284
285
286 # Store document details
287 my $OID = $doc_obj->get_OID();
288 $OID = "NULL" unless defined $OID;
289 my $dochandle = $self->{'dochandle'};
290 print $dochandle "<Document>\t$OID\t$title\n";
291
292 # Store the text occurring in this object
293
294 # output the document delimiter
295 my $txthandle = $self->{'txthandle'};
296 print $txthandle "$doclimit\n";
297
298 # iterate over the required indexes and store their text
299 my $indexes = $self->{'indexes'};
300 my $text = "";
301 my ($part, $level, $field, $section, $data, $dataref);
302
303 foreach $part (split(/,/, $indexes)) {
304
305 # Each field has a level and a data element (e.g. document:Title)
306 ($level, $field) = split(/:/, $part);
307 die unless ($level && $field);
308
309 # Extract the text from every section
310 # (In phind, document:text and section:text are equivalent)
311 if ($field eq "text") {
312 $data = "";
313 $section = $doc_obj->get_top_section();
314 while (defined($section)) {
315 $data .= $doc_obj->get_text($section) . "\n";
316 $section = $doc_obj->get_next_section($section);
317 }
318 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
319 }
320
321 # Extract a metadata field from a document
322 # (If there is more than one element of the given type, get them all.)
323 elsif ($level eq "document") {
324 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
325 foreach $data (@$dataref) {
326 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
327 }
328 }
329
330 # Extract metadata from every section in a document
331 elsif ($level eq "section") {
332 $data = "";
333 $section = $doc_obj->get_top_section();
334 while (defined($section)) {
335 $dataref = $doc_obj->get_metadata($section, $field);
336 $data .= join("\n", @$dataref) . "\n";
337 $section = $doc_obj->get_next_section($section);
338 }
339 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
340 }
341
342 # Any other level specification is an error
343 else {
344 die "Unknown level ($level) in Phind index ($part)\n";
345 }
346
347 }
348
349 # output the text
350 $text =~ tr/\n//s;
351 print $txthandle "$text";
352}
353
354
355# Construct the classifier from the information already gathered
356#
357# When get_classify_info is called, the clauses and docs.txt files have
358# already been constructed in the Phind directory. This function will
359# translate them into compressed, indexed MGPP files that can be read by
360# the phindcgi script. It will also register our classifier so that it
361# shows up in the navigation bar.
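# In outline, the steps below are: extract_vocabulary (vocabulary, statistics
# and numbers files), the external "suffix" program (phrase extraction),
# renumber_phrases (sorting and renumbering for mgpp), and finally the mgpp_*
# tools, which build the phrase, word-level and document databases.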
362
363sub get_classify_info {
364 my $self = shift (@_);
365 my ($gli) = @_;
366
367 close $self->{'dochandle'};
368 close $self->{'txthandle'};
369 my $verbosity = $self->{'verbosity'};
370 my $out = $self->{'outhandle'};
371 my $phinddir = $self->{'phinddir'};
372
373 my $osextra = "";
374 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
375 $osextra = " -d /";
376 }
377
378 print STDERR "</Stage>\n" if $gli;
379
380 if ($verbosity) {
381 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
382 print $out "*** in ", $self->{'phinddir'}, "\n";
383 }
384
385 print STDERR "<Stage name='Phind'>\n" if $gli;
386
387 # Construct phind indexes
388 my $suffixmode = $self->{'suffixmode'};
389 my $min_occurs = $self->{'min_occurs'};
390 my ($command, $status);
391
392 # Generate the vocabulary, symbol statistics, and numbers file
393 # from the clauses file
394 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
395 print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
396 &extract_vocabulary($self);
397
398 # Use the suffix program to generate the phind/phrases file
399 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
400 print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
401 &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
402
403 # check that we generated some files. It's not necessarily an error if
404 # we didn't (execute() would have quit on error), but we can't go on.
405 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
406 if (! -r $phrasesfile) {
407 print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
408 print $out "\nNo phrases found for Phind classifier!\n";
409 return;
410 }
411
412 # Create the phrase file and put phrase numbers in phind/phrases
413 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
414 print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
415 &renumber_phrases($self);
416
417 print $out "\nCreating phrase databases\n";
418 print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
419 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
420 my $mg_stem = &util::filename_cat($phinddir, "pdata");
421
422 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
423 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
424 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
425
426 # create the mg index of words
427 print $out "\nCreating word-level search indexes\n";
428 print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
429 $mg_input = &util::filename_cat($phinddir, "pword.txt");
430 $mg_stem = &util::filename_cat($phinddir, "pword");
431
432 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
433 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
434 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
435 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
436 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
437 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
438
439 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
440 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
441 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
442
443 # create the mg document information database
444 print $out "\nCreating document information databases\n";
445 print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
446 $mg_input = &util::filename_cat($phinddir, "docs.txt");
447 $mg_stem = &util::filename_cat($phinddir, "docs");
448
449 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
450 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
451 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
452
453 # Return the information about the classifier that we'll later want to
454 # use to create macros when the Phind classifier document is displayed.
455 my %classifyinfo = ('thistype'=>'Invisible',
456 'childtype'=>'Phind',
457 'Title'=>$self->{'buttonname'},
458 'parameters'=>"phindnumber=$self->{'phindnumber'}",
459 'contains'=>[]);
460
461 my $collection = $self->{'collection'};
462 my $url = "library?a=p&p=phind&c=$collection";
463 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
464
465 return \%classifyinfo;
466}
467
468
469
470sub convert_gml_to_tokens {
471
472 my ($language_exp, $text) = @_;
473
474 # escape any magic words... - jrm21
475 foreach my $delim (@delimiters) {
476 my $replacement=lc($delim);
477 my $num= $text=~ s/$delim/$replacement/g;
478 if (!$num) {$num=0;}
479 }
480
481 if ($language_exp =~ /en/) {
482 return &convert_gml_to_tokens_EN($text);
483 }
484
485 if ($language_exp =~ /zh/) {
486 return &convert_gml_to_tokens_ZH($text);
487 }
488
489 $_ = $text;
490
491 # 1. remove GML tags
492
493 # Remove everything that is in a tag
494 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
495 s/\s*<br>\s*/ LINEBREAK /isgo;
496 s/<[^>]*>/ /sgo;
497
498 # Now we have the text, but it may contain HTML
499 # elements coded as &gt; etc. Remove these tags.
500 s/&amp;/&/sgo;
501 s/&lt;/</sgo;
502 s/&gt;/>/sgo;
503 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
504 s/\s*<br>\s*/ LINEBREAK /isgo;
505 s/<[^>]*>/ /sgo;
506
507 # replace <p> and <br> placeholders with the clause break symbol (\n)
508 s/\s+/ /gso;
509 s/PARAGRAPHBREAK/\n/sgo;
510 s/LINEBREAK/\n/sgo;
511
512
513
514
515 # 2. Split the remaining text into space-delimited tokens
516
517 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
518 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
519
520 # Split text at word boundaries
521 s/\b/ /go;
522
523 # 3. Convert the remaining text to "clause format"
524
525 # Insert newline if the end of a sentence is detected
526 # (delimiter is: "[\.\?\!]\s")
527 # s/\s*[\.\?\!]\s+/\n/go;
528
529 # remove unnecessary punctuation and replace with clause break symbol (\n)
530 s/[^\w ]/\n/go;
531
532 # remove extraneous whitespace
533 s/ +/ /sgo;
534 s/^\s+//mgo;
535 s/\s*$/\n/mgo;
536
537 # remove lines that contain one word or less
538 s/^\S*$//mgo;
539 s/^\s*$//mgo;
540 tr/\n//s;
541
542 return $_;
543}
544
545# a chinese version
546sub convert_gml_to_tokens_ZH {
547
548 $_ = shift @_;
549
550 # Replace all whitespace with a simple space
551 s/\s+/ /gs;
552 # Remove everything that is in a tag
553 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
554 s/\s*<br>\s*/ LINEBREAK /isg;
555 s/<[^>]*>/ /sg;
556
557 # Now we have the text, but it may contain HTML
558 # elements coded as &gt; etc. Remove these tags.
559 s/&lt;/</sg;
560 s/&gt;/>/sg;
561
562 s/\s+/ /sg;
563 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
564 s/\s*<br>\s*/ LINEBREAK /isg;
565 s/<[^>]*>/ /sg;
566
567 # remove &amp; and other miscellaneous markup tags
568 s/&amp;/&/sg;
569 s/&lt;/</sg;
570 s/&gt;/>/sg;
571 s/&amp;/&/sg;
572
573 # replace <p> and <br> placeholders with carriage returns
574 s/PARAGRAPHBREAK/\n/sg;
575 s/LINEBREAK/\n/sg;
576
577
578# print STDERR "text:$_\n";
579 return $_;
580}
581
582# A version of convert_gml_to_tokens that is fine-tuned to the English language.
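# As a rough illustration of the transformation below (the input fragment is
# made up):
#   <p>The quick-brown fox jumped. It ran away!</p>
# is reduced to the clause lines
#   The quick brown fox jumped
#   It ran away
# i.e. markup and punctuation are dropped, hyphens become spaces, and each
# sentence or clause ends up on its own line.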
583
584sub convert_gml_to_tokens_EN {
585 $_ = shift @_;
586
587 # FIRST, remove GML tags
588
589 # Replace all whitespace with a simple space
590 s/\s+/ /gs;
591
592 # Remove everything that is in a tag
593 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
594 s/\s*<br>\s*/ LINEBREAK /isg;
595 s/<[^>]*>/ /sg;
596
597 # Now we have the text, but it may contain HTML
598 # elements coded as &gt; etc. Remove these tags.
599 s/&lt;/</sg;
600 s/&gt;/>/sg;
601
602 s/\s+/ /sg;
603 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
604 s/\s*<br>\s*/ LINEBREAK /isg;
605 s/<[^>]*>/ /sg;
606
607 # remove &amp; and other miscellaneous markup tags
608 s/&amp;/&/sg;
609 s/&lt;/</sg;
610 s/&gt;/>/sg;
611 s/&amp;/&/sg;
612
613 # replace <p> and <br> placeholders with carriage returns
614 s/PARAGRAPHBREAK/\n/sg;
615 s/LINEBREAK/\n/sg;
616
617
618 # Exceptional punctuation
619 #
620 # We make special cases of some punctuation
621
622 # remove any apostrophe that indicates omitted letters
623 s/(\w+)\'(\w*\s)/ $1$2 /g;
624
625 # remove the period that appears in a person's initials
626 s/\s([A-Z])\./ $1 /g;
627
628 # replace hyphens in hyphenated words and names with a space
629 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
630
631 # Convert the remaining text to "clause format".
632 # This means removing all excess punctuation and garbage text,
633 # normalising valid punctuation to fullstops and commas,
634 # then putting one clause on each line.
635
636 # Insert newline when the end of a sentence is detected
637 # (delimiter is: "[\.\?\!]\s")
638 s/\s*[\.\?\!]\s+/\n/g;
639
640 # split numbers after four digits
641 s/(\d\d\d\d)/$1 /g;
642
643 # split words after 32 characters
644
645 # squash repeated punctuation
646 tr/A-Za-z0-9 //cs;
647
648 # save email addresses
649 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
650
651 # normalise clause breaks (mostly punctuation symbols) to commas
652 s/[^A-Za-z0-9 \n]+/ , /g;
653
654 # Remove repeated commas, and replace with newline
655 s/\s*,[, ]+/\n/g;
656
657 # remove extra whitespace
658 s/ +/ /sg;
659 s/^\s+//mg;
660 s/\s*$/\n/mg;
661
662 # remove lines that contain one word or less
663 s/^\w*$//mg;
664 s/^\s*$//mg;
665 tr/\n//s;
666
667 return $_;
668
669}
670
671
672
673# Execute a system command
674
675sub execute {
676 my ($command, $verbosity, $outhandle) = @_;
677 print $outhandle "Executing: $command\n" if ($verbosity > 2);
678 $! = 0;
679 my $status = system($command);
680 if ($status != 0) {
681 print STDERR "Phind - Error executing '$command': $!\n";
682 exit($status); # this causes the build to fail...
683 }
684}
685
686
687# Generate the vocabulary, symbol statistics, and numbers file from the
688# clauses file. This is legacy code, so is a bit messy and probably won't
689# run under Windows.
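# A sketch of the files written here (derived from the code below):
#   clauses.vocab   - the "best" form of each symbol, one per line, in
#                     symbol-number order (delimiters, stopwords, thesaurus
#                     terms, then other content words)
#   clauses.stats   - the symbol-number ranges for each class of symbol
#   clauses.numbers - the clauses file re-expressed as symbol numbers, one
#                     per line, bracketed by the COLLECTIONSTART and
#                     COLLECTIONEND symbols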
690
691sub extract_vocabulary {
692 my ($self) = @_;
693
694 my $verbosity = $self->{'verbosity'};
695 my $out = $self->{'outhandle'};
696
697 my $collectiondir = $self->{'collectiondir'};
698 my $phinddir = $self->{'phinddir'};
699
700 my $language_exp = $self->{'language_exp'};
701
702 my ($w, $l, $line, $word);
703
704 my ($first_delimiter, $last_delimiter,
705 $first_stopword, $last_stopword,
706 $first_extractword, $last_extractword,
707 $first_contentword, $last_contentword,
708 $phrasedelimiter);
709
710 my $thesaurus = $self->{'thesaurus'};
711 my ($thesaurus_links, $thesaurus_terms,
712 %thesaurus, $first_thesaurusword, $last_thesaurusword);
713
714 my %symbol;
715 my (%freq);
716
717 print $out "Calculating vocabulary\n" if ($verbosity > 1);
718
719 # Read and store the stopwords
720 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
721 my $stopword_files = ();
722 my ($language, $language_dir, $file, $file_name);
723 my %stopwords;
724
725 # Examine each directory in the stopword directory
726 opendir(STOPDIR, $stopdir);
727 foreach $language (readdir STOPDIR) {
728
729 # Ignore entries that do not match the classifier's language
730 next unless ($language =~ /$language_exp/);
731 $language_dir = &util::filename_cat($stopdir, $language);
732 next unless (-d "$language_dir");
733
734 opendir(LANGDIR, $language_dir);
735 foreach $file (readdir LANGDIR) {
736
737 # Ignore entries that are not stopword files
738 next unless ($file =~ /sw$/);
739 $file_name = &util::filename_cat($language_dir, $file);
740 next unless (-f "$file_name");
741
742 # Read the stopwords
743 open(STOPFILE, "<$file_name");
744 while (<STOPFILE>) {
745 s/^\s+//;
746 s/\s.*//;
747 $word = $_;
748 $l = lc($word);
749 $stopwords{$l} = $word;
750 }
751 close STOPFILE;
752
753 }
754 closedir LANGDIR;
755 }
756 closedir STOPDIR;
757
758 # Read thesaurus information
759 if ($thesaurus) {
760
761 # link file exists
762 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
763 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
764
765 # ensure term file exists in the correct language
766 if ($language_exp =~ /^([a-z][a-z])/) {
767 $language = $1;
768 } else {
769 $language = 'en';
770 }
771 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
772 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
773
774
775 # Read the thesaurus terms
776 open(TH, "<$thesaurus_terms");
777 while(<TH>) {
778 s/^\d+ //;
779 s/\(.*\)//;
780 foreach $w (split(/\s+/, $_)) {
781 $thesaurus{lc($w)} = $w;
782 }
783 }
784 close TH;
785 }
786
787 # Read words in the text and count occurrences
788 open(TXT, "<$phinddir/clauses");
789
790 my @words;
791 while(<TXT>) {
792 $line = $_;
793 next unless ($line =~ /./);
794
795 @words = split(/\s+/, $line);
796 foreach $w (@words) {
797 $l = lc($w);
798 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
799 $freq{$w}++;
800 }
801 $freq{$senlimit}++;
802 }
803
804 close TXT;
805
806 # Calculate the "best" form of each word
807 my (%bestform, %totalfreq, %bestfreq);
808
809 foreach $w (sort (keys %freq)) {
810 $l = lc($w);
811
812 # totalfreq is the number of times a term appears in any form
813 $totalfreq{$l} += $freq{$w};
814
815 if (defined $stopwords{$l}) {
816 $bestform{$l} = $stopwords{$l};
817
818 } elsif (defined $thesaurus{$l}) {
819 $bestform{$l} = $thesaurus{$l};
820
821 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
822 $bestfreq{$l} = $freq{$w};
823 $bestform{$l} = $w;
824 }
825 }
826 undef %freq;
827 undef %bestfreq;
828
829
830 # Assign symbol numbers to tokens
831 my $nextsymbol = 1;
832 my (@vocab);
833
834 # Delimiters
835 $first_delimiter = 1;
836
837 foreach $word (@delimiters) {
838
839# $word = lc($word); # jrm21
840 $word = uc($word);
841 $bestform{$word} = $word;
842 $vocab[$nextsymbol] = $word;
843 $symbol{$word} = $nextsymbol;
844 $nextsymbol++;
845 }
846 $last_delimiter = $nextsymbol - 1;
847 # Stopwords
848 $first_stopword = $nextsymbol;
849
850 foreach my $word (sort keys %stopwords) {
851 # don't include stopword unless it occurs in the text
852 $word = lc($word);
853 next unless ($totalfreq{$word});
854 next if ($symbol{$word});
855
856 $vocab[$nextsymbol] = $word;
857 $symbol{$word} = $nextsymbol;
858 $nextsymbol++;
859 }
860 $last_stopword = $nextsymbol - 1;
861 $first_contentword = $nextsymbol;
862
863 # Thesaurus terms
864 if ($thesaurus) {
865 $first_thesaurusword = $nextsymbol;
866
867 foreach my $word (sort keys %thesaurus) {
868
869 $word = lc($word);
870 next if ($symbol{$word});
871 $bestform{$word} = $thesaurus{$word};
872
873 $vocab[$nextsymbol] = $word;
874 $symbol{$word} = $nextsymbol;
875 $nextsymbol++;
876
877 }
878 $last_thesaurusword = $nextsymbol - 1;
879 }
880
881 # Other content words
882 $first_extractword = $nextsymbol;
883
884 foreach my $word (sort (keys %bestform)) {
885
886 next if ($symbol{$word});
887
888 $vocab[$nextsymbol] = $word;
889 $symbol{$word} = $nextsymbol;
890 $nextsymbol++;
891 }
892 $last_extractword = $nextsymbol - 1;
893 $last_contentword = $nextsymbol - 1;
894
895 # Output the words
896 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
897 open(VOC, ">$phinddir/clauses.vocab");
898
899 for (my $i = 1; $i < $nextsymbol; $i++) {
900 $w = $vocab[$i];
901
902 print VOC "$bestform{$w}\n";
903 $totalfreq{$w} = 0 unless ($totalfreq{$w});
904 }
905 close VOC;
906
907
908 # Create statistics file
909 # Output statistics about the vocabulary
910 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
911 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
912
913 open(STAT, ">$phinddir/clauses.stats")
914 || die "Cannot open $phinddir/clauses.stats: $!";
915
916 print STAT "first_delimiter $first_delimiter\n";
917 print STAT "last_delimiter $last_delimiter\n";
918 print STAT "first_stopword $first_stopword\n";
919 print STAT "last_stopword $last_stopword\n";
920 if ($thesaurus) {
921 print STAT "first_thesaurusword $first_thesaurusword\n";
922 print STAT "last_thesaurusword $last_thesaurusword\n";
923 }
924 print STAT "first_extractword $first_extractword\n";
925 print STAT "last_extractword $last_extractword\n";
926 print STAT "first_contentword $first_contentword\n";
927 print STAT "last_contentword $last_contentword\n";
928 print STAT "first_symbol $first_delimiter\n";
929 print STAT "last_symbol $last_contentword\n";
930 print STAT "first_word $first_stopword\n";
931 print STAT "last_word $last_contentword\n";
932 close STAT;
933
934 undef @vocab;
935
936
937 # Create numbers file
938 # Save text as symbol numbers
939 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
940
941 open(TXT, "<$phinddir/clauses");
942 open(NUM, ">$phinddir/clauses.numbers");
943
944## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
945## print NUM "$symbol{lc($colstart)}\n"; # jrm21
946 $phrasedelimiter = $symbol{$senlimit};
947 print NUM "$symbol{$colstart}\n";
948
949 # set up the special symbols that delimit documents and sentences
950 while(<TXT>) {
951
952 # split sentence into a list of tokens
953 $line = $_;
954 next unless ($line =~ /./);
955 @words = split(/\s+/, $line);
956
957 # output one token at a time
958 foreach $word (@words) {
959# don't lower-case special delimiters - jrm21
960 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
961 $word = lc($word);
962 }
963 print NUM "$symbol{$word}\n";
964 }
965
966 # output phrase delimiter
967 print NUM "$phrasedelimiter\n";
968 }
969
970 close TXT;
971# print NUM "$symbol{lc($colend)}\n";# jrm21
972 print NUM "$symbol{$colend}\n";
973 close NUM;
974
975 # Save thesaurus data in one convenient file
976 if ($thesaurus) {
977
978 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
979
980
981 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
982 if ($verbosity > 1);
983
984 # Read the thesaurus terms
985 my ($num, $text, %thes_symbols);
986
987 open(TH, "<$thesaurus_terms");
988 while(<TH>) {
989 chomp;
990 @words = split(/\s+/, $_);
991 $num = shift @words;
992 $text = "";
993
994 # translate words into symbol numbers
995 foreach $word (@words) {
996 $word = lc($word);
997 if ($symbol{$word}) {
998 $text .= "s$symbol{$word} ";
999 } elsif ($verbosity) {
1000 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
1001 }
1002 }
1003 $text =~ s/ $//;
1004 $thes_symbols{$num} = $text;
1005 }
1006 close TH;
1007
1008 # Read the thesaurus links and write the corresponding data
1009 open(TH, "<$thesaurus_links");
1010 open(THOUT, ">$thesaurusfile");
1011
1012 while(<TH>) {
1013 chomp;
1014 ($num, $text) = split(/:/, $_);
1015
1016 if (defined($thes_symbols{$num})) {
1017 print THOUT "$num:$thes_symbols{$num}:$text\n";
1018 } else {
1019 print THOUT "$num:untranslated:$text\n";
1020 }
1021 }
1022 close TH;
1023 close THOUT;
1024 }
1025
1026
1027
1028
1029}
1030
1031
1032# renumber_phrases
1033#
1034# Prepare the phrases file to be input to mgpp. The biggest problem is
1035# reconciling the phrase identifiers used by the suffix program (which
1036# we'll call suffix-id numbers) with the numbers used in the thesaurus
1037# (thesaurus-id) to create a common set of phind-id numbers.
1038# Phind-id numbers must be sorted by frequency of occurrence.
1039#
1040# Start creating a set of phind-id numbers from the sorted suffix-id
1041# numbers and (if required) the thesaurus-id numbers. Then add any other
1042# phrases occurring in the thesaurus.
1043#
1044# The last thing we have to do is restore the vocabulary information to the
1045# phrase file so that the phrases are stored as words, not as symbol
1046# numbers.
1047
1048# The original phrases file looks something like this:
1049# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1050# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1051# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1052# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1053
1054
1055sub renumber_phrases {
1056 my ($self) = @_;
1057
1058 renumber_suffix_data($self);
1059 renumber_thesaurus_data($self);
1060 restore_vocabulary_data($self);
1061
1062}
1063
1064
1065
1066# renumber_suffix_data
1067#
1068# Translate phrases file to phrases.2 using phind keys instead
1069# of suffix keys and sorting the expansion data.
1070
1071sub renumber_suffix_data {
1072 my ($self) = @_;
1073
1074 my $verbosity = $self->{'verbosity'};
1075 my $out = $self->{'outhandle'};
1076 print $out "Translate phrases: suffix-ids become phind-id's\n"
1077 if ($verbosity);
1078
1079 my $phinddir = $self->{'phinddir'};
1080 my $infile = &util::filename_cat($phinddir, 'phrases');
1081 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1082
1083 # Read the phrase file. Calculate initial set of phind-id
1084 # numbers and store (suffixid -> frequency) relation.
1085
1086 my %suffixtophind;
1087 my @totalfrequency;
1088 my (@fields, $suffixid);
1089 my $nextphind = 1;
1090
1091 open(IN, "<$infile");
1092 while(<IN>) {
1093
1094 chomp;
1095 @fields = split(/:/, $_);
1096
1097 # get next suffixid and phindid
1098 $suffixid = shift @fields;
1099 $suffixtophind{$suffixid} = $nextphind;
1100
1101 # store total frequency
1102 shift @fields;
1103 $totalfrequency[$nextphind] = shift @fields;
1104
1105 $nextphind++;
1106 }
1107 close IN;
1108
1109
1110 # Translate phrases file to phrases.2. Use phind keys (not suffix
1111 # keys), sort expansion and document occurrence data in order of
1112 # descending frequency.
1113 open(IN, "<$infile");
1114 open(OUT, ">$outfile");
1115
1116 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1117 my (@documents, @newexp, $k, $n);
1118 my $linenumber = 0;
1119
1120 while(<IN>) {
1121
1122 # read the line
1123 chomp;
1124 @fields = split(/:/, $_);
1125
1126 # get a phrase number for this line
1127 $suffixid = shift @fields;
1128 die unless (defined($suffixtophind{$suffixid}));
1129 $phindid = $suffixtophind{$suffixid};
1130
1131 # get the symbols in the phrase
1132 $text = shift @fields;
1133
1134 # output status information
1135 $linenumber++;
1136 if ($verbosity > 2) {
1137 if ($linenumber % 1000 == 0) {
1138 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1139 }
1140 print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1141 }
1142
1143 # get the phrase frequency
1144 $tf = shift @fields;
1145
1146 # get the number of expansions
1147 $countexp = shift @fields;
1148
1149 # get the expansions, convert them into phind-id numbers, and sort them
1150 $expansions = shift @fields;
1151 @newexp = ();
1152 foreach $k (split(/,/, $expansions)) {
1153 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1154 $n = $suffixtophind{$k};
1155 push @newexp, $n;
1156 }
1157 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1158
1159 # get the number of documents
1160 $countdocs = shift @fields;
1161
1162 # get the documents and sort them
1163 $documents = shift @fields;
1164 $documents =~ s/d//g;
1165 @documents = split(/;/, $documents);
1166 @documents = sort by_doc_frequency @documents;
1167
1168 # output the phrase data
1169 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1170 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1171
1172 }
1173
1174 close IN;
1175 close OUT;
1176}
1177
1178
1179# renumber_thesaurus_data
1180#
1181# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1182
1183sub renumber_thesaurus_data {
1184 my ($self) = @_;
1185
1186 my $out = $self->{'outhandle'};
1187 my $verbosity = $self->{'verbosity'};
1188 my $thesaurus = $self->{'thesaurus'};
1189
1190 my $phinddir = $self->{'phinddir'};
1191 my $infile = &util::filename_cat($phinddir, "phrases.2");
1192 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1193
1194
1195 # If no thesaurus is defined, simply move the phrases file.
1196 if (!$thesaurus) {
1197 print $out "Translate phrases.2: no thesaurus data\n"
1198 if ($verbosity);
1199 &util::mv($infile, $outfile);
1200 return;
1201 }
1202
1203 print $out "Translate phrases.2: add thesaurus data\n"
1204 if ($verbosity);
1205
1206 # 1.
1207 # Read thesaurus file and store (symbols->thesaurusid) mapping
1208 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1209 my %symbolstothesid;
1210 my (@fields, $thesid, $symbols);
1211
1212 open(TH, "<$thesaurusfile");
1213
1214 while (<TH>) {
1215
1216 chomp;
1217 @fields = split(/:/, $_);
1218
1219 # get id and text
1220 $thesid = shift @fields;
1221 $symbols = shift @fields;
1222 $symbolstothesid{$symbols} = $thesid;
1223 }
1224 close TH;
1225
1226 # 2.
1227 # Read phrases file to find thesaurus entries that already
1228 # have a phindid. Store their phind-ids for later translation,
1229 # and store their frequency for later sorting.
1230 my %thesaurustophindid;
1231 my %phindidtofrequency;
1232 my ($phindid, $freq);
1233
1234 open(IN, "<$infile");
1235
1236 while(<IN>) {
1237
1238 chomp;
1239 @fields = split(/:/, $_);
1240
1241 # phindid and symbols for this line
1242 $phindid = shift @fields;
1243 $symbols = shift @fields;
1244 $freq = shift @fields;
1245
1246 # do we have a thesaurus id corresponding to this phrase?
1247 if (defined($symbolstothesid{$symbols})) {
1248 $thesid = $symbolstothesid{$symbols};
1249 $thesaurustophindid{$thesid} = $phindid;
1250 $phindidtofrequency{$phindid} = $freq;
1251 }
1252 }
1253 close IN;
1254
1255 undef %symbolstothesid;
1256
1257 # 3.
1258 # Create phind-id numbers for remaining thesaurus entries,
1259 # and note that their frequency is 0 for later sorting.
1260 my $nextphindid = $phindid + 1;
1261
1262 open(TH, "<$thesaurusfile");
1263 while(<TH>) {
1264
1265 chomp;
1266 @fields = split(/:/, $_);
1267
1268 # read thesaurus-id and ensure it has a corresponding phind-id
1269 $thesid = shift @fields;
1270 if (!defined($thesaurustophindid{$thesid})) {
1271 $thesaurustophindid{$thesid} = $nextphindid;
1272 $phindidtofrequency{$nextphindid} = 0;
1273 $nextphindid++;
1274 }
1275 }
1276 close TH;
1277
1278 # 4.
1279 # Translate thesaurus file, replacing thesaurus-id numbers with
1280 # phind-id numbers.
1281 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1282 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1283 my (@links, $linkid, %linkidtotype, $newrelation);
1284
1285 open(TH, "<$thesaurusfile");
1286 open(TO, ">$newthesaurusfile");
1287 while(<TH>) {
1288
1289 chomp;
1290 @fields = split(/:/, $_);
1291
1292 # phindid and symbols for this line
1293 ($thesid, $symbols, $relations) = @fields;
1294
1295 die unless ($thesid && $symbols);
1296 die unless $thesaurustophindid{$thesid};
1297 $phindid = $thesaurustophindid{$thesid};
1298
1299 # convert each part of the relation string to use phind-id numbers
1300 # at the same time, we want to sort the list by frequency.
1301 undef %linkidtotype;
1302
1303 foreach $linktext (split(/;/, $relations)) {
1304 @linkdata = split(/,/, $linktext);
1305
1306 # remember the linktype (e.g. BT, NT)
1307 $linktype = shift @linkdata;
1308
1309 # store the type of each link
1310 foreach $thesid (@linkdata) {
1311 die unless (defined($thesaurustophindid{$thesid}));
1312 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1313 }
1314 }
1315
1316 # sort the list of links, first by frequency, then by type.
1317 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1318 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1319 $linkcounter = (scalar @links);
1320
1321 # create a string describing the link information
1322 $linktype = $linkidtotype{$links[0]};
1323 $newrelation = $linktype;
1324 foreach $linkid (@links) {
1325 if ($linkidtotype{$linkid} ne $linktype) {
1326 $linktype = $linkidtotype{$linkid};
1327 $newrelation .= ";" . $linktype;
1328 }
1329 $newrelation .= "," . $linkid;
1330 }
1331 $newrelation .= ";";
1332
1333
1334 # output the new line
1335 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1336 }
1337 close TH;
1338 close TO;
1339
1340 undef %thesaurustophindid;
1341 undef %linkidtotype;
1342 undef %phindidtofrequency;
1343
1344 # 5.
1345 # Read thesaurus data (in phind-id format) into memory
1346 my %thesaurusdata;
1347
1348 open(TH, "<$newthesaurusfile");
1349 while(<TH>) {
1350 chomp;
1351 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1352 die unless ($phindid && $symbols);
1353 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1354 }
1355 close TH;
1356
1357 # 6.
1358 # Add thesaurus data to phrases file
1359 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1360 my (@documents, @newexp, $k, $n);
1361 my $linenumber = 0;
1362
1363 open(IN, "<$infile");
1364 open(OUT, ">$outfile");
1365
1366 # Update existing phrases
1367 while(<IN>) {
1368
1369 chomp;
1370 @fields = split(/:/, $_);
1371
1372 # get data for this line
1373 $phindid = shift @fields;
1374
1375 # output the phrase data, with thesaurus information
1376 print OUT "$phindid:", join(":", @fields);
1377
1378 # add thesaurus data
1379 if (defined($thesaurusdata{$phindid})) {
1380 @fields = split(/:/, $thesaurusdata{$phindid});
1381 shift @fields;
1382 $linkcounter = shift @fields;
1383 $relations = shift @fields;
1384
1385 print OUT ":$linkcounter:$relations";
1386 $thesaurusdata{$phindid} = "";
1387 }
1388 print OUT "\n";
1389 }
1390 close IN;
1391
1392 # Add phrases that aren't already in the file
1393 foreach $phindid (sort numerically keys %thesaurusdata) {
1394 next unless ($thesaurusdata{$phindid});
1395
1396 @fields = split(/:/, $thesaurusdata{$phindid});
1397 $symbols = shift @fields;
1398 $linkcounter = shift @fields;
1399 $relations = shift @fields;
1400
1401 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1402 }
1403 close OUT;
1404
1405}
1406
1407# restore_vocabulary_data
1408#
1409# Read phrases.3 and restore vocabulary information. Then write
1410# this data to the MGPP input files (pword.txt and pdata.txt) and
1411# (if requested) to the saved phrases file.
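# A sketch of the output formats (field values below are illustrative only):
#   pdata.txt - one "<Document>" record per phrase, e.g.
#       <Document>42:forest management:25:3:10:...
#     with the symbol numbers of the phrase expanded back into words
#   pword.txt - one "<Document>" line per phrase, holding the phrase text only
#     when the phrase is a single word (it is left empty otherwise)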
1412
1413sub restore_vocabulary_data {
1414 my ($self) = @_;
1415
1416 my $out = $self->{'outhandle'};
1417 my $verbosity = $self->{'verbosity'};
1418 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1419
1420 my $phinddir = $self->{'phinddir'};
1421 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1422 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1423 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1424 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1425
1426 my $savephrases = $self->{'savephrases'};
1427
1428 # 1.
1429 # Read the vocabulary file
1430 open(V, "<$vocabfile")
1431 || die "Cannot open $vocabfile: $!";
1432 my @symbol;
1433 my $i = 1;
1434 while(<V>) {
1435 chomp;
1436 $symbol[$i++] = $_;
1437 }
1438 close V;
1439
1440 # 2.
1441 # Translate phrases.3 to MGPP input files
1442 my ($key, $text, $word, $isThesaurus);
1443 my @fields;
1444 my $linenumber = 0;
1445
1446 open(IN, "<$infile");
1447 open(DATA, ">$datafile");
1448 open(WORD, ">$wordfile");
1449
1450 # Save the phrases in a separate text file
1451 if ($savephrases) {
1452 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1453 open(SAVE, ">$savephrases");
1454 }
1455
1456 while(<IN>) {
1457
1458 # read the line
1459 chomp;
1460 $line = $_;
1461 @fields = split(/:/, $line);
1462
1463 # get a phrase number for this line
1464 $key = shift @fields;
1465
1466 # restore the text of the phrase
1467 $text = shift @fields;
1468 $text =~ s/s(\d+)/$symbol[$1]/g;
1469 if ($text =~ / /) {
1470 $word = "";
1471 } elsif ($text ne 'untranslated') {
1472 $word = $text;
1473 }
1474
1475 # output the phrase data
1476 print DATA "<Document>";
1477 print DATA "$key:$text:", join(":", @fields), ":\n";
1478
1479 # output the word index search data
1480 print WORD "<Document>$word\n";
1481
1482 # output the phrases to a text file
1483 if ($savephrases) {
1484 if ((scalar @fields) == 7) {
1485 $isThesaurus = 1;
1486 } else {
1487 $isThesaurus = 0;
1488 }
1489 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1490 }
1491 }
1492 close IN;
1493 close WORD;
1494 close DATA;
1495 close SAVE if ($savephrases);
1496
1497}
1498
1499
1500
1501# sort routines used to renumber phrases
1502
1503sub numerically { $a <=> $b }
1504
1505sub by_doc_frequency {
1506 my $fa = 1;
1507 if ($a =~ /,/) {
1508 $fa = $a;
1509 $fa =~ s/\d+,//;
1510 }
1511 my $fb = 1;
1512 if ($b =~ /,/) {
1513 $fb = $b;
1514 $fb =~ s/\d+,//;
1515 }
1516
1517 return ($fb <=> $fa);
1518}
1519
15201;