source: trunk/gsdl/perllib/classify/Phind.pm@ 10253

Last change on this file since 10253 was 10253, checked in by kjdon, 19 years ago

added 'use strict' to all classifiers, and made modifications (mostly adding 'my') to make them compile

  • Property svn:keywords set to Author Date Id Revision
File size: 40.3 KB
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32# 12/05/02 Added usage data structure - John Thompson
33
34package Phind;
35
36use BasClas;
37use util;
38use ghtml;
39use unicode;
40
41use strict;
42no strict 'refs'; # allow filehandles to be variables and vice versa
43
44my @removedirs = ();
45
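# File suffixes of the mgpp index files we want to keep when the END block
# below tidies up a phind directory; any other file left in the directory
# is deleted.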
46my %wanted_index_files = ('td'=>1,
47 't'=>1,
48 'ti'=>1,
49 'tl'=>1,
50 'tsd'=>1,
51 'idb'=>1,
52 'ib1'=>1,
53 'ib2'=>1,
54 'ib3'=>1,
55 'i'=>1,
56 'il'=>1,
57 'w'=>1,
58 'wa'=>1);
59
60sub BEGIN {
61 @Phind::ISA = ('BasClas');
62}
63
64sub END {
65
66 # Tidy up stray files - we do this here as there's some weird problem
67 # preventing us from doing it in the get_classify_info() function (on
68 # windows at least) where the close() appears to fail on txthandle and
69 # dochandle, thus preventing us from deleting those files
70
71 foreach my $dir (@removedirs) {
72 if (-d $dir && opendir (DIR, $dir)) {
73 my @files = readdir DIR;
74 closedir DIR;
75
76 foreach my $file (@files) {
77 next if $file =~ /^\.\.?$/;
78 my ($suffix) = $file =~ /\.([^\.]+)$/;
79 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
80 # delete it!
81 &util::rm (&util::filename_cat ($dir, $file));
82 }
83 }
84 }
85 }
86}
87
88my $arguments =
89 [ { 'name' => "text",
90 'desc' => "{Phind.text}",
91 'type' => "string",
92 'deft' => "section:Title,section:text",
93 'reqd' => "no" },
94 { 'name' => "title",
95 'desc' => "{Phind.title}",
96 'type' => "metadata",
97 'deft' => "Title",
98 'reqd' => "no" },
99 { 'name' => "buttonname",
100 'desc' => "{BasClas.buttonname}",
101 'type' => "string",
102 'deft' => "Phrase",
103 'reqd' => "no" },
104 { 'name' => "language",
105 'desc' => "{Phind.language}",
106 'type' => "string",
107 'deft' => "en",
108 'reqd' => "no" },
109 { 'name' => "savephrases",
110 'desc' => "{Phind.savephrases}",
111 'type' => "string",
112 'deft' => "",
113 'reqd' => "no" },
114 { 'name' => "suffixmode",
115 'desc' => "{Phind.suffixmode}",
116 'type' => "int",
117 'deft' => "1",
118 'range' => "0,1",
119 'reqd' => "no" },
120 { 'name' => "min_occurs",
121 'desc' => "{Phind.min_occurs}",
122 'type' => "int",
123 'deft' => "2",
124 'range' => "1,",
125 'reqd' => "no" },
126 { 'name' => "thesaurus",
127 'desc' => "{Phind.thesaurus}",
128 'type' => "string",
129 'deft' => "",
130 'reqd' => "no" },
131 { 'name' => "untidy",
132 'desc' => "{Phind.untidy}",
133 'type' => "flag",
134 'reqd' => "no" } ];
135
136my $options = { 'name' => "Phind",
137 'desc' => "{Phind.desc}",
138 'abstract' => "no",
139 'inherits' => "yes",
140 'args' => $arguments };
141
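# An illustrative collect.cfg entry (a sketch only -- the option values shown
# are simply the defaults from the argument list above, not requirements):
#
#   classify Phind -text section:Title,section:text -title Title -buttonname Phrase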
142
143# Phrase delimiter symbols - these should be abstracted out someplace
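# These sentinel tokens mark collection, document and sentence boundaries in
# the clauses stream. convert_gml_to_tokens() lower-cases any copies that
# happen to occur in the document text so they cannot be mistaken for the
# real sentinels, and extract_vocabulary() assigns them the first symbol
# numbers.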
144
145my $colstart = "COLLECTIONSTART";
146my $colend = "COLLECTIONEND";
147my $doclimit = "DOCUMENTLIMIT";
148my $senlimit = "SENTENCELIMIT";
149my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
150
151
152# Create a new Phind browser based on collect.cfg
153
154sub new {
155 my ($class) = shift (@_);
156 my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
157 push(@$classifierslist, $class);
158
159 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
160 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
161
162 my $self = (defined $hashArgOptLists)? new BasClas($classifierslist,$inputargs,$hashArgOptLists): new BasClas($classifierslist,$inputargs);
163
164 if ($self->{'info_only'}) {
165 # don't worry about any options etc
166 return bless $self, $class;
167 }
168
169 # Ensure the Phind generate scripts are in place
170 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
171 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
172 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
173 if (!(-e $file1)) {
174 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
175 exit(1);
176 }
177
178# Transfer the value from Auto Parsing to the variable name used in previous Greenstone versions.
179 $self->{"indexes"} = $self->{"text"};
180
181 # Further setup
182 $self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
183 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
184 if (! defined $self->{'builddir'}) {
185 $self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
186 }
187 $self->{'total'} = 0;
188
189 # Clean out the unused keys
190 delete $self->{"text"};
191
192 return bless $self, $class;
193}
194
195
196# Initialise the Phind classifier
197
198sub init {
199 my $self = shift (@_);
200
201 # ensure we have a build directory
202 my $builddir = $self->{'builddir'};
203 die unless (-e "$builddir");
204
205 # create Phind directory
206 my $phnumber = 1;
207 my $phinddir = &util::filename_cat($builddir, "phind1");
208 while (-e "$phinddir") {
209 $phnumber++;
210 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
211 }
212 &util::mk_dir("$phinddir");
213 $self->{'phinddir'} = $phinddir;
214 $self->{'phindnumber'} = $phnumber;
215
216 push(@removedirs, $phinddir) unless $self->{'untidy'};
217
218 # open filehandles for documents and text
219 my $clausefile = &util::filename_cat("$phinddir", "clauses");
220 &util::rm($clausefile) if (-e $clausefile);
221
222 my $txthandle = 'TEXT' . $phnumber;
223 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
224 $self->{'txthandle'} = $txthandle;
225
226 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
227 &util::rm($docfile) if (-e $docfile);
228
229 my $dochandle = 'DOC' . $phnumber;
230 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
231 $self->{'dochandle'} = $dochandle;
232
233}
234
235
236# Classify each document.
237#
238# Each document is passed here in turn. The classifier extracts the
239# text of each and stores it in the clauses file. Document details are
240# stored in the docs.txt file.
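# Each document contributes one tab-separated line to docs.txt of the form
#   <Document>\tOID\ttitle
# (OID and title stand in for the document's real identifier and title).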
241
242sub classify {
243 my $self = shift (@_);
244 my $doc_obj = shift @_;
245
246 my $verbosity = $self->{'verbosity'};
247 my $top_section = $doc_obj->get_top_section();
248
249 my $titlefield = $self->{'title'};
250
251 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
252 if (!defined($title)) {
253 $title = "";
254 print STDERR "Phind: document has no title\n";
255 }
256 print "process: $title\n" if ($verbosity > 2);
257
258 # Only consider the file if it is in the correct language
259 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
260 my $phrlanguage = $self->{'language'};
261 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
262
263 # record this file
264 $self->{'total'} ++;
265 # what is $file ???
266 # print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
267
268 # Store document details
269 my $OID = $doc_obj->get_OID();
270 $OID = "NULL" unless defined $OID;
271 my $dochandle = $self->{'dochandle'};
272 print $dochandle "<Document>\t$OID\t$title\n";
273
274 # Store the text occurring in this object
275
276 # output the document delimiter
277 my $txthandle = $self->{'txthandle'};
278 print $txthandle "$doclimit\n";
279
280 # iterate over the required indexes and store their text
281 my $indexes = $self->{'indexes'};
282 my $text = "";
283 my ($part, $level, $field, $section, $data, $dataref);
284
285 foreach $part (split(/,/, $indexes)) {
286
287 # Each field has a level and a data element (e.g. document:Title)
288 ($level, $field) = split(/:/, $part);
289 die unless ($level && $field);
290
291 # Extract the text from every section
292 # (In phind, document:text and section:text are equivalent)
293 if ($field eq "text") {
294 $data = "";
295 $section = $doc_obj->get_top_section();
296 while (defined($section)) {
297 $data .= $doc_obj->get_text($section) . "\n";
298 $section = $doc_obj->get_next_section($section);
299 }
300 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
301 }
302
303 # Extract a metadata field from a document
304 # (If there is more than one element of the given type, get them all.)
305 elsif ($level eq "document") {
306 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
307 foreach $data (@$dataref) {
308 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
309 }
310 }
311
312 # Extract metadata from every section in a document
313 elsif ($level eq "section") {
314 $data = "";
315 $section = $doc_obj->get_top_section();
316 while (defined($section)) {
317 $dataref = $doc_obj->get_metadata($section, $field);
318 $data .= join("\n", @$dataref) . "\n";
319 $section = $doc_obj->get_next_section($section);
320 }
321 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
322 }
323
324 # Some sort of specification which I don't understand
325 else {
326 die "Unknown level ($level) in Phind index ($part)\n";
327 }
328
329 }
330
331 # output the text
332 $text =~ tr/\n//s;
333 print $txthandle "$text";
334}
335
336
337# Construct the classifier from the information already gathered
338#
339# When get_classify_info is called, the clauses and docs.txt files have
340# already been constructed in the Phind directory. This function will
341# translate them into compressed, indexed MGPP files that can be read by
342# the phindcgi script. It will also register our classifier so that it
343# shows up in the navigation bar.
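# In outline (as implemented below): extract_vocabulary() writes the
# clauses.vocab, clauses.stats and clauses.numbers files; the external
# "suffix" program produces the phrases file; renumber_phrases() rewrites it
# as phrases.2 and phrases.3 and restores the vocabulary into pdata.txt and
# pword.txt; finally the mgpp_* tools build the pdata, pword and docs
# databases that phindcgi reads.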
344
345sub get_classify_info {
346 my $self = shift (@_);
347 my ($gli) = @_;
348
349 close $self->{'dochandle'};
350 close $self->{'txthandle'};
351 my $verbosity = $self->{'verbosity'};
352 my $out = $self->{'outhandle'};
353 my $phinddir = $self->{'phinddir'};
354
355 my $osextra = "";
356 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
357 $osextra = " -d /";
358 }
359
360 print STDERR "</Stage>\n" if $gli;
361
362 if ($verbosity) {
363 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
364 print $out "*** in ", $self->{'phinddir'}, "\n";
365 }
366
367 print STDERR "<Stage name='Phind'>\n" if $gli;
368
369 # Construct phind indexes
370 my $suffixmode = $self->{'suffixmode'};
371 my $min_occurs = $self->{'min_occurs'};
372 my ($command, $status);
373
374 # Generate the vocabulary, symbol statistics, and numbers file
375 # from the clauses file
376 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
377 print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
378 &extract_vocabulary($self);
379
380 # Use the suffix program to generate the phind/phrases file
381 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
382 print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
383 &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
384
385 # check that we generated some files. It's not necessarily an error if
386 # we didn't (execute() would have quit on error), but we can't go on.
387 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
388 if (! -r $phrasesfile) {
389 print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
390 print $out "\nNo phrases found for Phind classifier!\n";
391 return;
392 }
393
394 # Create the phrase file and put phrase numbers in phind/phrases
395 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
396 print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
397 &renumber_phrases($self);
398
399 print $out "\nCreating phrase databases\n";
400 print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
401 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
402 my $mg_stem = &util::filename_cat($phinddir, "pdata");
403
404 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
405 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
406 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
407
408 # create the mg index of words
409 print $out "\nCreating word-level search indexes\n";
410 print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
411 $mg_input = &util::filename_cat($phinddir, "pword.txt");
412 $mg_stem = &util::filename_cat($phinddir, "pword");
413
414 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
415 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
416 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
417 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
418 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
419 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
420
421 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
422 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
423 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
424
425 # create the mg document information database
426 print $out "\nCreating document information databases\n";
427 print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
428 $mg_input = &util::filename_cat($phinddir, "docs.txt");
429 $mg_stem = &util::filename_cat($phinddir, "docs");
430
431 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
432 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
433 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
434
435 # Return the information about the classifier that we'll later want to
436 # use to create macros when the Phind classifier document is displayed.
437 my %classifyinfo = ('thistype'=>'Invisible',
438 'childtype'=>'Phind',
439 'Title'=>$self->{'buttonname'},
440 'parameters'=>"phindnumber=$self->{'phindnumber'}",
441 'contains'=>[]);
442
443 my $collection = $self->{'collection'};
444 my $url = "library?a=p&p=phind&c=$collection";
445 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
446
447 return \%classifyinfo;
448}
449
450
451
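# Convert a clause of (possibly GML/HTML marked-up) text into the token
# stream stored in the clauses file. Dispatches to the English or Chinese
# variants below when the language expression matches "en" or "zh", and
# falls back to the generic tokeniser otherwise.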
452sub convert_gml_to_tokens {
453
454 my ($language_exp, $text) = @_;
455
456 # escape any magic words... - jrm21
457 foreach my $delim (@delimiters) {
458 my $replacement=lc($delim);
459 my $num= $text=~ s/$delim/$replacement/g;
460 if (!$num) {$num=0;}
461 }
462
463 if ($language_exp =~ /en/) {
464 return &convert_gml_to_tokens_EN($text);
465 }
466
467 if ($language_exp =~ /zh/) {
468 return &convert_gml_to_tokens_ZH($text);
469 }
470
471 $_ = $text;
472
473 # 1. remove GML tags
474
475 # Remove everything that is in a tag
476 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
477 s/\s*<br>\s*/ LINEBREAK /isgo;
478 s/<[^>]*>/ /sgo;
479
480 # Now we have the text, but it may contain HTML
481 # elements coded as &gt; etc. Remove these tags.
482 s/&amp;/&/sgo;
483 s/&lt;/</sgo;
484 s/&gt;/>/sgo;
485 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
486 s/\s*<br>\s*/ LINEBREAK /isgo;
487 s/<[^>]*>/ /sgo;
488
489 # replace <p> and <br> placeholders with the clause break symbol (\n)
490 s/\s+/ /gso;
491 s/PARAGRAPHBREAK/\n/sgo;
492 s/LINEBREAK/\n/sgo;
493
494
495
496
497 # 2. Split the remaining text into space-delimited tokens
498
499 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
500 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
501
502 # Split text at word boundaries
503 s/\b/ /go;
504
505 # 3. Convert the remaining text to "clause format"
506
507 # Insert newline if the end of a sentence is detected
508 # (delimiter is: "[\.\?\!]\s")
509 # s/\s*[\.\?\!]\s+/\n/go;
510
511 # remove unnecessary punctuation and replace with clause break symbol (\n)
512 s/[^\w ]/\n/go;
513
514 # remove extraneous whitespace
515 s/ +/ /sgo;
516 s/^\s+//mgo;
517 s/\s*$/\n/mgo;
518
519 # remove lines that contain one word or less
520 s/^\S*$//mgo;
521 s/^\s*$//mgo;
522 tr/\n//s;
523
524 return $_;
525}
526
527# a chinese version
528sub convert_gml_to_tokens_ZH {
529
530 $_ = shift @_;
531
532 # Replace all whitespace with a simple space
533 s/\s+/ /gs;
534 # Remove everything that is in a tag
535 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
536 s/\s*<br>\s*/ LINEBREAK /isg;
537 s/<[^>]*>/ /sg;
538
539 # Now we have the text, but it may contain HTML
540 # elements coded as &gt; etc. Remove these tags.
541 s/&lt;/</sg;
542 s/&gt;/>/sg;
543
544 s/\s+/ /sg;
545 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
546 s/\s*<br>\s*/ LINEBREAK /isg;
547 s/<[^>]*>/ /sg;
548
549 # remove &amp; and other miscellaneous markup tags
550 s/&amp;/&/sg;
551 s/&lt;/</sg;
552 s/&gt;/>/sg;
553 s/&amp;/&/sg;
554
555 # replace <p> and <br> placeholders with newlines
556 s/PARAGRAPHBREAK/\n/sg;
557 s/LINEBREAK/\n/sg;
558
559
560# print STDERR "text:$_\n";
561 return $_;
562}
563
564# A version of convert_gml_to_tokens that is fine-tuned to the English language.
565
566sub convert_gml_to_tokens_EN {
567 $_ = shift @_;
568
569 # FIRST, remove GML tags
570
571 # Replace all whitespace with a simple space
572 s/\s+/ /gs;
573
574 # Remove everything that is in a tag
575 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
576 s/\s*<br>\s*/ LINEBREAK /isg;
577 s/<[^>]*>/ /sg;
578
579 # Now we have the text, but it may contain HTML
580 # elements coded as &gt; etc. Remove these tags.
581 s/&lt;/</sg;
582 s/&gt;/>/sg;
583
584 s/\s+/ /sg;
585 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
586 s/\s*<br>\s*/ LINEBREAK /isg;
587 s/<[^>]*>/ /sg;
588
589 # remove &amp; and other miscellaneous markup tags
590 s/&amp;/&/sg;
591 s/&lt;/</sg;
592 s/&gt;/>/sg;
593 s/&amp;/&/sg;
594
595 # replace <p> and <br> placeholders with newlines
596 s/PARAGRAPHBREAK/\n/sg;
597 s/LINEBREAK/\n/sg;
598
599
600 # Exceptional punctuation
601 #
602 # We make special cases of some punctuation
603
604 # remove any apostrophe that indicates omitted letters
605 s/(\w+)\'(\w*\s)/ $1$2 /g;
606
607 # remove periods that appear in a person's initials
608 s/\s([A-Z])\./ $1 /g;
609
610 # replace hyphens in hyphenated words and names with a space
611 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
612
613 # Convert the remaining text to "clause format".
614 # This means removing all excess punctuation and garbage text,
615 # normalising valid punctuation to full stops and commas,
616 # then putting one clause on each line.
617
618 # Insert newline when the end of a sentence is detected
619 # (delimiter is: "[\.\?\!]\s")
620 s/\s*[\.\?\!]\s+/\n/g;
621
622 # split numbers after four digits
623 s/(\d\d\d\d)/$1 /g;
624
625 # split words after 32 characters
626
627 # squash repeated punctuation
628 tr/A-Za-z0-9 //cs;
629
630 # save email addresses
631 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
632
633 # normalise clause breaks (mostly punctuation symbols) to commas
634 s/[^A-Za-z0-9 \n]+/ , /g;
635
636 # Remove repeated commas, and replace with newline
637 s/\s*,[, ]+/\n/g;
638
639 # remove extra whitespace
640 s/ +/ /sg;
641 s/^\s+//mg;
642 s/\s*$/\n/mg;
643
644 # remove lines that contain one word or less
645 s/^\w*$//mg;
646 s/^\s*$//mg;
647 tr/\n//s;
648
649 return $_;
650
651}
652
653
654
655# Execute a system command
656
657sub execute {
658 my ($command, $verbosity, $outhandle) = @_;
659 print $outhandle "Executing: $command\n" if ($verbosity > 2);
660 $! = 0;
661 my $status = system($command);
662 if ($status != 0) {
663 print STDERR "Phind - Error executing '$command': $!\n";
664 exit($status); # this causes the build to fail...
665 }
666}
667
668
669# Generate the vocabulary, symbol statistics, and numbers file from the
670# clauses file. This is legacy code, so it is a bit messy and probably won't
671# run under Windows.
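# In brief: read the stopword lists and (optionally) the thesaurus, count
# word frequencies in the clauses file, then write clauses.vocab (one "best
# form" per symbol), clauses.stats (the symbol number ranges) and
# clauses.numbers (the clauses re-expressed as symbol numbers), plus a
# <thesaurus>.numbers file when a thesaurus is in use.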
672
673sub extract_vocabulary {
674 my ($self) = @_;
675
676 my $verbosity = $self->{'verbosity'};
677 my $out = $self->{'outhandle'};
678
679 my $collectiondir = $self->{'collectiondir'};
680 my $phinddir = $self->{'phinddir'};
681
682 my $language_exp = $self->{'language'};
683
684 my ($w, $l, $line, $word);
685
686 my ($first_delimiter, $last_delimiter,
687 $first_stopword, $last_stopword,
688 $first_extractword, $last_extractword,
689 $first_contentword, $last_contentword,
690 $phrasedelimiter);
691
692 my $thesaurus = $self->{'thesaurus'};
693 my ($thesaurus_links, $thesaurus_terms,
694 %thesaurus, $first_thesaurusword, $last_thesaurusword);
695
696 my %symbol;
697 my (%freq);
698
699 print $out "Calculating vocabulary\n" if ($verbosity > 1);
700
701 # Read and store the stopwords
702 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
703 my $stopword_files = ();
704 my ($language, $language_dir, $file, $file_name);
705 my %stopwords;
706
707 # Examine each directory in the stopword directory
708 opendir(STOPDIR, $stopdir);
709 foreach $language (readdir STOPDIR) {
710
711 # Ignore entries that do not match the classifier's language
712 next unless ($language =~ /$language_exp/);
713 $language_dir = &util::filename_cat($stopdir, $language);
714 next unless (-d "$language_dir");
715
716 opendir(LANGDIR, $language_dir);
717 foreach $file (readdir LANGDIR) {
718
719 # Ignore entries that are not stopword files
720 next unless ($file =~ /sw$/);
721 $file_name = &util::filename_cat($language_dir, $file);
722 next unless (-f "$file_name");
723
724 # Read the stopwords
725 open(STOPFILE, "<$file_name");
726 while (<STOPFILE>) {
727 s/^\s+//;
728 s/\s.*//;
729 $word = $_;
730 $l = lc($word);
731 $stopwords{$l} = $word;
732 }
733 close STOPFILE;
734
735 }
736 closedir LANGDIR;
737 }
738 closedir STOPDIR;
739
740 # Read thesaurus information
741 if ($thesaurus) {
742
743 # link file exists
744 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
745 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
746
747 # ensure term file exists in the correct language
748 if ($language_exp =~ /^([a-z][a-z])/) {
749 $language = $1;
750 } else {
751 $language = 'en';
752 }
753 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
754 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
755
756
757 # Read the thesaurus terms
758 open(TH, "<$thesaurus_terms");
759 while(<TH>) {
760 s/^\d+ //;
761 s/\(.*\)//;
762 foreach $w (split(/\s+/, $_)) {
763 $thesaurus{lc($w)} = $w;
764 }
765 }
766 close TH;
767 }
768
769 # Read words in the text and count occurrences
770 open(TXT, "<$phinddir/clauses");
771
772 my @words;
773 while(<TXT>) {
774 $line = $_;
775 next unless ($line =~ /./);
776
777 @words = split(/\s+/, $line);
778 foreach $w (@words) {
779 $l = lc($w);
780 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
781 $freq{$w}++;
782 }
783 $freq{$senlimit}++;
784 }
785
786 close TXT;
787
788 # Calculate the "best" form of each word
789 my (%bestform, %totalfreq, %bestfreq);
790
791 foreach $w (sort (keys %freq)) {
792 $l = lc($w);
793
794 # totalfreq is the number of times a term appears in any form
795 $totalfreq{$l} += $freq{$w};
796
797 if (defined $stopwords{$l}) {
798 $bestform{$l} = $stopwords{$l};
799
800 } elsif (defined $thesaurus{$l}) {
801 $bestform{$l} = $thesaurus{$l};
802
803 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
804 $bestfreq{$l} = $freq{$w};
805 $bestform{$l} = $w;
806 }
807 }
808 undef %freq;
809 undef %bestfreq;
810
811
812 # Assign symbol numbers to tokens
813 my $nextsymbol = 1;
814 my (@vocab);
815
816 # Delimiters
817 $first_delimiter = 1;
818
819 foreach $word (@delimiters) {
820
821# $word = lc($word); # jrm21
822 $word = uc($word);
823 $bestform{$word} = $word;
824 $vocab[$nextsymbol] = $word;
825 $symbol{$word} = $nextsymbol;
826 $nextsymbol++;
827 }
828 $last_delimiter = $nextsymbol - 1;
829 # Stopwords
830 $first_stopword = $nextsymbol;
831
832 foreach my $word (sort keys %stopwords) {
833 # don't include stopword unless it occurs in the text
834 $word = lc($word);
835 next unless ($totalfreq{$word});
836 next if ($symbol{$word});
837
838 $vocab[$nextsymbol] = $word;
839 $symbol{$word} = $nextsymbol;
840 $nextsymbol++;
841 }
842 $last_stopword = $nextsymbol - 1;
843 $first_contentword = $nextsymbol;
844
845 # Thesaurus terms
846 if ($thesaurus) {
847 $first_thesaurusword = $nextsymbol;
848
849 foreach my $word (sort keys %thesaurus) {
850
851 $word = lc($word);
852 next if ($symbol{$word});
853 $bestform{$word} = $thesaurus{$word};
854
855 $vocab[$nextsymbol] = $word;
856 $symbol{$word} = $nextsymbol;
857 $nextsymbol++;
858
859 }
860 $last_thesaurusword = $nextsymbol - 1;
861 }
862
863 # Other content words
864 $first_extractword = $nextsymbol;
865
866 foreach my $word (sort (keys %bestform)) {
867
868 next if ($symbol{$word});
869
870 $vocab[$nextsymbol] = $word;
871 $symbol{$word} = $nextsymbol;
872 $nextsymbol++;
873 }
874 $last_extractword = $nextsymbol - 1;
875 $last_contentword = $nextsymbol - 1;
876
877 # Output the words
878 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
879 open(VOC, ">$phinddir/clauses.vocab");
880
881 for (my $i = 1; $i < $nextsymbol; $i++) {
882 $w = $vocab[$i];
883
884 print VOC "$bestform{$w}\n";
885 $totalfreq{$w} = 0 unless ($totalfreq{$w});
886 }
887 close VOC;
888
889
890 # Create statistics file
891 # Output statistics about the vocabulary
892 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
893 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
894
895 open(STAT, ">$phinddir/clauses.stats")
896 || die "Cannot open $phinddir/clauses.stats: $!";
897
898 print STAT "first_delimiter $first_delimiter\n";
899 print STAT "last_delimiter $last_delimiter\n";
900 print STAT "first_stopword $first_stopword\n";
901 print STAT "last_stopword $last_stopword\n";
902 if ($thesaurus) {
903 print STAT "first_thesaurusword $first_thesaurusword\n";
904 print STAT "last_thesaurusword $last_thesaurusword\n";
905 }
906 print STAT "first_extractword $first_extractword\n";
907 print STAT "last_extractword $last_extractword\n";
908 print STAT "first_contentword $first_contentword\n";
909 print STAT "last_contentword $last_contentword\n";
910 print STAT "first_symbol $first_delimiter\n";
911 print STAT "last_symbol $last_contentword\n";
912 print STAT "first_word $first_stopword\n";
913 print STAT "last_word $last_contentword\n";
914 close STAT;
915
916 undef @vocab;
917
918
919 # Create numbers file
920 # Save text as symbol numbers
921 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
922
923 open(TXT, "<$phinddir/clauses");
924 open(NUM, ">$phinddir/clauses.numbers");
925
926## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
927## print NUM "$symbol{lc($colstart)}\n"; # jrm21
928 $phrasedelimiter = $symbol{$senlimit};
929 print NUM "$symbol{$colstart}\n";
930
931 # set up the special symbols that delimit documents and sentences
932 while(<TXT>) {
933
934 # split sentence into a list of tokens
935 $line = $_;
936 next unless ($line =~ /./);
937 @words = split(/\s+/, $line);
938
939 # output one token at a time
940 foreach $word (@words) {
941# don't lower-case special delimiters - jrm21
942 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
943 $word = lc($word);
944 }
945 print NUM "$symbol{$word}\n";
946 }
947
948 # output phrase delimiter
949 print NUM "$phrasedelimiter\n";
950 }
951
952 close TXT;
953# print NUM "$symbol{lc($colend)}\n";# jrm21
954 print NUM "$symbol{$colend}\n";
955 close NUM;
956
957 # Save thesaurus data in one convenient file
958 if ($thesaurus) {
959
960 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
961
962
963 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
964 if ($verbosity > 1);
965
966 # Read the thesaurus terms
967 my ($num, $text, %thes_symbols);
968
969 open(TH, "<$thesaurus_terms");
970 while(<TH>) {
971 chomp;
972 @words = split(/\s+/, $_);
973 $num = shift @words;
974 $text = "";
975
976 # translate words into symbol numbers
977 foreach $word (@words) {
978 $word = lc($word);
979 if ($symbol{$word}) {
980 $text .= "s$symbol{$word} ";
981 } elsif ($verbosity) {
982 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
983 }
984 }
985 $text =~ s/ $//;
986 $thes_symbols{$num} = $text;
987 }
988 close TH;
989
990 # Read the thesaurus links and write the corresponding data
991 open(TH, "<$thesaurus_links");
992 open(THOUT, ">$thesaurusfile");
993
994 while(<TH>) {
995 chomp;
996 ($num, $text) = split(/:/, $_);
997
998 if (defined($thes_symbols{$num})) {
999 print THOUT "$num:$thes_symbols{$num}:$text\n";
1000 } else {
1001 print THOUT "$num:untranslated:$text\n";
1002 }
1003 }
1004 close TH;
1005 close THOUT;
1006 }
1007
1008
1009
1010
1011}
1012
1013
1014# renumber_phrases
1015#
1016# Prepare the phrases file to be input to mgpp. The biggest problem is
1017# reconciling the phrase identifiers used by the suffix program (which
1018# we'll call suffix-id numbers) with the numbers used in the thesaurus
1019# (thesaurus-id) to create a common set of phind id numbers (phind-id).
1020# Phind-id numbers must be sorted by frequency of occurrence.
1021#
1022# Start creating a set of phind-id numbers from the sorted suffix-id
1023# numbers and (if required) the thesaurus-id numbers. Then add any other
1024# phrases occurring in the thesaurus.
1025#
1026# The last thing we have to do is restore the vocabulary information to the
1027# phrase file so that the phrases are stored as words, not as symbol
1028# numbers.
1029
1030# The original phrases file looks something like this:
1031# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1032# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1033# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1034# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
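# Reading off renumber_suffix_data() below, the colon-separated fields are:
#   suffix-id : symbols : total frequency : expansion count : expansions :
#   document count : document list
# where each document entry has the form "d<docnum>[,<frequency>]".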
1035
1036
1037sub renumber_phrases {
1038 my ($self) = @_;
1039
1040 renumber_suffix_data($self);
1041 renumber_thesaurus_data($self);
1042 restore_vocabulary_data($self);
1043
1044}
1045
1046
1047
1048# renumber_suffix_data
1049#
1050# Translate phrases file to phrases.2 using phind keys instead
1051# of suffix keys and sorting the expansion data.
1052
1053sub renumber_suffix_data {
1054 my ($self) = @_;
1055
1056 my $verbosity = $self->{'verbosity'};
1057 my $out = $self->{'outhandle'};
1058 print $out "Translate phrases: suffix-ids become phind-ids\n"
1059 if ($verbosity);
1060
1061 my $phinddir = $self->{'phinddir'};
1062 my $infile = &util::filename_cat($phinddir, 'phrases');
1063 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1064
1065 # Read the phrase file. Calculate initial set of phind-id
1066 # numbers and store (suffixid -> frequency) relation.
1067
1068 my %suffixtophind;
1069 my @totalfrequency;
1070 my (@fields, $suffixid);
1071 my $nextphind = 1;
1072
1073 open(IN, "<$infile");
1074 while(<IN>) {
1075
1076 chomp;
1077 @fields = split(/:/, $_);
1078
1079 # get next suffixid and phindid
1080 $suffixid = shift @fields;
1081 $suffixtophind{$suffixid} = $nextphind;
1082
1083 # store total frequency
1084 shift @fields;
1085 $totalfrequency[$nextphind] = shift @fields;
1086
1087 $nextphind++;
1088 }
1089 close IN;
1090
1091
1092 # Translate phrases file to phrases.2. Use phind keys (not suffix
1093# keys), sort expansion and document occurrence data in order of
1094# descending frequency.
1095 open(IN, "<$infile");
1096 open(OUT, ">$outfile");
1097
1098 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1099 my (@newexp, $k, $n);
1100 my $linenumber = 0;
1101
1102 while(<IN>) {
1103
1104 # read the line
1105 chomp;
1106 @fields = split(/:/, $_);
1107
1108 # get a phrase number for this line
1109 $suffixid = shift @fields;
1110 die unless (defined($suffixtophind{$suffixid}));
1111 $phindid = $suffixtophind{$suffixid};
1112
1113 # get the symbols in the phrase
1114 $text = shift @fields;
1115
1116 # output status information
1117 $linenumber++;
1118 if ($verbosity > 2) {
1119 if ($linenumber % 1000 == 0) {
1120 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1121 }
1122 # what are $num and $key??
1123 #print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1124 }
1125
1126 # get the phrase frequency
1127 $tf = shift @fields;
1128
1129 # get the number of expansions
1130 $countexp = shift @fields;
1131
1132 # get the expansions, convert them into phind-id numbers, and sort them
1133 $expansions = shift @fields;
1134 @newexp = ();
1135 foreach $k (split(/,/, $expansions)) {
1136 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1137 $n = $suffixtophind{$k};
1138 push @newexp, $n;
1139 }
1140 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1141
1142 # get the number of documents
1143 $countdocs = shift @fields;
1144
1145 # get the documents and sort them
1146 $documents = shift @fields;
1147 $documents =~ s/d//g;
1148 my @documents = split(/;/, $documents);
1149 @documents = sort by_doc_frequency @documents;
1150
1151 # output the phrase data
1152 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1153 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1154
1155 }
1156
1157 close IN;
1158 close OUT;
1159}
1160
1161
1162# renumber_thesaurus_data
1163#
1164# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1165
1166sub renumber_thesaurus_data {
1167 my ($self) = @_;
1168
1169 my $out = $self->{'outhandle'};
1170 my $verbosity = $self->{'verbosity'};
1171 my $thesaurus = $self->{'thesaurus'};
1172
1173 my $phinddir = $self->{'phinddir'};
1174 my $infile = &util::filename_cat($phinddir, "phrases.2");
1175 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1176
1177
1178 # If no thesaurus is defined, simply move the phrases file.
1179 if (!$thesaurus) {
1180 print $out "Translate phrases.2: no thesaurus data\n"
1181 if ($verbosity);
1182 &util::mv($infile, $outfile);
1183 return;
1184 }
1185
1186 print $out "Translate phrases.2: add thesaurus data\n"
1187 if ($verbosity);
1188
1189 # 1.
1190 # Read thesaurus file and store (symbols->thesaurusid) mapping
1191 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1192 my %symbolstothesid;
1193 my (@fields, $thesid, $symbols);
1194
1195 open(TH, "<$thesaurusfile");
1196
1197 while (<TH>) {
1198
1199 chomp;
1200 @fields = split(/:/, $_);
1201
1202 # get id and text
1203 $thesid = shift @fields;
1204 $symbols = shift @fields;
1205 $symbolstothesid{$symbols} = $thesid;
1206 }
1207 close TH;
1208
1209 # 2.
1210 # Read phrases file to find thesaurus entries that already
1211 # have a phindid. Store their phind-ids for later translation,
1212 # and store their frequency for later sorting.
1213 my %thesaurustophindid;
1214 my %phindidtofrequency;
1215 my ($phindid, $freq);
1216
1217 open(IN, "<$infile");
1218
1219 while(<IN>) {
1220
1221 chomp;
1222 @fields = split(/:/, $_);
1223
1224 # phindid and symbols for this line
1225 $phindid = shift @fields;
1226 $symbols = shift @fields;
1227 $freq = shift @fields;
1228
1229 # do we have a thesaurus id corresponding to this phrase?
1230 if (defined($symbolstothesid{$symbols})) {
1231 $thesid = $symbolstothesid{$symbols};
1232 $thesaurustophindid{$thesid} = $phindid;
1233 $phindidtofrequency{$phindid} = $freq;
1234 }
1235 }
1236 close IN;
1237
1238 undef %symbolstothesid;
1239
1240 # 3.
1241 # Create phind-id numbers for remaining thesaurus entries,
1242 # and note that their frequency is 0 for later sorting.
1243 my $nextphindid = $phindid + 1;
1244
1245 open(TH, "<$thesaurusfile");
1246 while(<TH>) {
1247
1248 chomp;
1249 @fields = split(/:/, $_);
1250
1251 # read thesaurus-id and ensure it has a corresponding phind-id
1252 $thesid = shift @fields;
1253 if (!defined($thesaurustophindid{$thesid})) {
1254 $thesaurustophindid{$thesid} = $nextphindid;
1255 $phindidtofrequency{$nextphindid} = 0;
1256 $nextphindid++;
1257 }
1258 }
1259 close TH;
1260
1261 # 4.
1262 # Translate thesaurus file, replacing thesaurus-id numbers with
1263 # phind-id numbers.
1264 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1265 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1266 my (@links, $linkid, %linkidtotype, $newrelation);
1267
1268 open(TH, "<$thesaurusfile");
1269 open(TO, ">$newthesaurusfile");
1270 while(<TH>) {
1271
1272 chomp;
1273 @fields = split(/:/, $_);
1274
1275 # phindid and symbols for this line
1276 ($thesid, $symbols, $relations) = @fields;
1277
1278 die unless ($thesid && $symbols);
1279 die unless $thesaurustophindid{$thesid};
1280 $phindid = $thesaurustophindid{$thesid};
1281
1282 # convert each part of the relation string to use phind-id numbers
1283 # at the same time, we want to sort the list by frequency.
1284 undef %linkidtotype;
1285
1286 foreach $linktext (split(/;/, $relations)) {
1287 @linkdata = split(/,/, $linktext);
1288
1289 # remember the linktype (e.g. BT, NT)
1290 $linktype = shift @linkdata;
1291
1292 # store the type of each link
1293 foreach $thesid (@linkdata) {
1294 die unless (defined($thesaurustophindid{$thesid}));
1295 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1296 }
1297 }
1298
1299 # sort the list of links, first by frequency, then by type.
1300 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1301 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1302 $linkcounter = (scalar @links);
1303
1304 # create a string describing the link information
1305 $linktype = $linkidtotype{$links[0]};
1306 $newrelation = $linktype;
1307 foreach $linkid (@links) {
1308 if ($linkidtotype{$linkid} ne $linktype) {
1309 $linktype = $linkidtotype{$linkid};
1310 $newrelation .= ";" . $linktype;
1311 }
1312 $newrelation .= "," . $linkid;
1313 }
1314 $newrelation .= ";";
1315
1316
1317 # output the new line
1318 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1319 }
1320 close TH;
1321 close TO;
1322
1323 undef %thesaurustophindid;
1324 undef %linkidtotype;
1325 undef %phindidtofrequency;
1326
1327 # 5.
1328 # Read thesaurus data (in phind-id format) into memory
1329 my %thesaurusdata;
1330
1331 open(TH, "<$newthesaurusfile");
1332 while(<TH>) {
1333 chomp;
1334 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1335 die unless ($phindid && $symbols);
1336 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1337 }
1338 close TH;
1339
1340 # 6.
1341 # Add thesaurus data to phrases file
1342 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1343 my (@newexp, $k, $n);
1344 my $linenumber = 0;
1345
1346 open(IN, "<$infile");
1347 open(OUT, ">$outfile");
1348
1349 # Update existing phrases
1350 while(<IN>) {
1351
1352 chomp;
1353 @fields = split(/:/, $_);
1354
1355 # get data for this line
1356 $phindid = shift @fields;
1357
1358 # output the phrase data, with thesaurus information
1359 print OUT "$phindid:", join(":", @fields);
1360
1361 # add thesaurus data
1362 if (defined($thesaurusdata{$phindid})) {
1363 @fields = split(/:/, $thesaurusdata{$phindid});
1364 shift @fields;
1365 $linkcounter = shift @fields;
1366 $relations = shift @fields;
1367
1368 print OUT ":$linkcounter:$relations";
1369 $thesaurusdata{$phindid} = "";
1370 }
1371 print OUT "\n";
1372 }
1373 close IN;
1374
1375 # Add phrases that aren't already in the file
1376 foreach $phindid (sort numerically keys %thesaurusdata) {
1377 next unless ($thesaurusdata{$phindid});
1378
1379 @fields = split(/:/, $thesaurusdata{$phindid});
1380 $symbols = shift @fields;
1381 $linkcounter = shift @fields;
1382 $relations = shift @fields;
1383
1384 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1385 }
1386 close OUT;
1387
1388}
1389
1390# restore_vocabulary_data
1391#
1392# Read phrases.3 and restore vocabulary information. Then write
1393# this data to the MGPP input files (pword.txt and pdata.txt) and
1394# (if requested) to the saved phrases file.
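# Each pdata.txt record has the form "<Document>phind-id:phrase text:...";
# the matching pword.txt record is "<Document>" plus the phrase text for
# single-word phrases, and is left empty for multi-word phrases, so only
# single words end up word-searchable.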
1395
1396sub restore_vocabulary_data {
1397 my ($self) = @_;
1398
1399 my $out = $self->{'outhandle'};
1400 my $verbosity = $self->{'verbosity'};
1401 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1402
1403 my $phinddir = $self->{'phinddir'};
1404 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1405 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1406 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1407 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1408
1409 my $savephrases = $self->{'savephrases'};
1410
1411 # 1.
1412 # Read the vocabulary file
1413 open(V, "<$vocabfile")
1414 || die "Cannot open $vocabfile: $!";
1415 my @symbol;
1416 my $i = 1;
1417 while(<V>) {
1418 chomp;
1419 $symbol[$i++] = $_;
1420 }
1421 close V;
1422
1423 # 2.
1424 # Translate phrases.3 to MGPP input files
1425 my ($key, $text, $word, $isThesaurus, $line);
1426 my @fields;
1427 my $linenumber = 0;
1428
1429 open(IN, "<$infile");
1430 open(DATA, ">$datafile");
1431 open(WORD, ">$wordfile");
1432
1433 # Save the phrases in a separate text file
1434 if ($savephrases) {
1435 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1436 open(SAVE, ">$savephrases");
1437 }
1438
1439 while(<IN>) {
1440
1441 # read the line
1442 chomp;
1443 $line = $_;
1444 @fields = split(/:/, $line);
1445
1446 # get a phrase number for this line
1447 $key = shift @fields;
1448
1449 # restore the text of the phrase
1450 $text = shift @fields;
1451 $text =~ s/s(\d+)/$symbol[$1]/g;
1452 if ($text =~ / /) {
1453 $word = "";
1454 } elsif ($text ne 'untranslated') {
1455 $word = $text;
1456 }
1457
1458 # output the phrase data
1459 print DATA "<Document>";
1460 print DATA "$key:$text:", join(":", @fields), ":\n";
1461
1462 # output the word index search data
1463 print WORD "<Document>$word\n";
1464
1465 # output the phrases to a text file
1466 if ($savephrases) {
1467 if ((scalar @fields) == 7) {
1468 $isThesaurus = 1;
1469 } else {
1470 $isThesaurus = 0;
1471 }
1472 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1473 }
1474 }
1475 close IN;
1476 close WORD;
1477 close DATA;
1478 close SAVE if ($savephrases);
1479
1480}
1481
1482
1483
1484# sort routines used to renumber phrases
1485
1486sub numerically { $a <=> $b }
1487
1488sub by_doc_frequency {
1489 my $fa = 1;
1490 if ($a =~ /,/) {
1491 $fa = $a;
1492 $fa =~ s/\d+,//;
1493 }
1494 my $fb = 1;
1495 if ($b =~ /,/) {
1496 $fb = $b;
1497 $fb =~ s/\d+,//;
1498 }
1499
1500 return ($fb <=> $fa);
1501}
1502
15031;
Note: See TracBrowser for help on using the repository browser.