Context Navigation

source: trunk/gsdl/perllib/classify/Phind.pm@ 3590

Last change on this file since 3590 was 3540, checked in by kjdon, 22 years ago
added John T's changes into CVS - added info to enable retrieval of usage info in xml
Property svn:keywords set to `Author Date Id Revision`
File size: 41.8 KB

Line
1	###########################################################################
2	#
3	# Phind.pm -- the Phind classifier
4	#
5	# Copyright (C) 2000 Gordon W. Paynter
6	# Copyright (C) 2000 New Zealand Digital Library Project
7	#
8	#
9	# A component of the Greenstone digital library software
10	# from the New Zealand Digital Library Project at the
11	# University of Waikato, New Zealand.
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# The Phind clasifier plugin.
30	# Options are dexcribed in the print_usage function.
31	# Type "classinfo.pl Phind" at the command line for a summary.
32
33	# 12/05/02 Added usage datastructure - John Thompson
34
35	package Phind;
36
37	use BasClas;
38	use util;
39	use ghtml;
40	use unicode;
41
42	my @removedirs = ();
43
44	my %wanted_index_files = ('td'=>1,
45	't'=>1,
46	'ti'=>1,
47	'tl'=>1,
48	'tsd'=>1,
49	'idb'=>1,
50	'ib1'=>1,
51	'ib2'=>1,
52	'ib3'=>1,
53	'i'=>1,
54	'il'=>1,
55	'w'=>1,
56	'wa'=>1);
57
58	sub BEGIN {
59	@ISA = ('BasClas');
60	}
61
62	sub END {
63
64	# Tidy up stray files - we do this here as there's some weird problem
65	# preventing us from doing it in the get_classify_info() function (on
66	# windows at least) where the close() appears to fail on txthandle and
67	# dochandle, thus preventing us from deleting those files
68
69	foreach my $dir (@removedirs) {
70	if (-d $dir && opendir (DIR, $dir)) {
71	my @files = readdir DIR;
72	closedir DIR;
73
74	foreach $file (@files) {
75	next if $file =~ /^\.\.?$/;
76	my ($suffix) = $file =~ /\.([^\.]+)$/;
77	if (!defined $suffix \|\| !defined $wanted_index_files{$suffix}) {
78	# delete it!
79	&util::rm (&util::filename_cat ($dir, $file));
80	}
81	}
82	}
83	}
84	}
85
86	my $arguments =
87	[ { 'name' => "text",
88	'desc' => "The text used to build the phrase hierarchy (default: 'section:Title,section:text').",
89	'type' => "string",
90	'reqd' => "no" } ,
91	{ 'name' => "title",
92	'desc' => "The metadata field used to describe each document (default: 'Title').",
93	'type' => "metadata",
94	'reqd' => "no" } ,
95	{ 'name' => "button",
96	'desc' => "The label for the classifier screen and button in navigation bar (default: 'Phrase').",
97	'type' => "string",
98	'reqd' => "no" } ,
99	{ 'name' => "language",
100	'desc' => "Language or languages to use building hierarchy. Languages are identified by two-letter country codes like en (English), es (Spanish), and fr (French). Language is a regular expression, so 'en\|fr' (English or French) and '..' (match any language) are valid (default: 'en').",
101	'type' => "language",
102	'reqd' => "no" } ,
103	{ 'name' => "savephrases",
104	'desc' => "If set, the phrase infomation will be stored in the given file as text. It is probably a good idea to use an absolute path (default: not set).",
105	'type' => "string",
106	'reqd' => "no" } ,
107	{ 'name' => "suffixmode",
108	'desc' => "The smode parameter to the phrase extraction program. A value of 0 means that stopwords are ignored, and of 1 means that stopwords are used (default: 1).",
109	'type' => "int",
110	'reqd' => "no" } ,
111	{ 'name' => "thesaurus",
112	'desc' => "Name of a thesaurus stored in Phind format in the collection's etc directory (default: not set).",
113	'type' => "string",
114	'reqd' => "no" } ,
115	{ 'name' => "untidy",
116	'desc' => "Don't remove working files.",
117	'type' => "flag",
118	'reqd' => "no" }
119	];
120
121	my $options =
122	{ 'name' => "Phind",
123	'desc' => "The Phind clasifier plugin.",
124	'inherits' => "Yes",
125	'args' => $arguments };
126
127	sub print_usage {
128	print STDERR "
129	usage: classify Phind [options]
130
131	options:
132	-text Fields The text used to build the phrase hierarchy.
133	(default: 'section:Title,section:text')
134
135	-title Title The metadata field used to describe each document.
136	(default: 'Title')
137
138	-button Name The label for the classifier screen and button in
139	navigation bar.
140	(default: 'Phrase')
141
142	-language Regex Language or languages to use building hierarchy.
143	Languages are identified by two-letter country codes
144	like en (English), es (Spanish), and fr (French).
145	Language is a regular expression, so 'en\|fr' (English or
146	French) and '..' (match any language) are valid.
147	(default: 'en'.)
148
149	-savephrases File If set, the phrase infomation will be stored in
150	the given file as text. It is probably a good idea
151	to use an absolute path.
152	(default: not set)
153
154	-suffixmode N The smode parameter to the phrase extraction program. A
155	value of 0 means that stopwords are ignored, and of 1
156	means that stopwords are used.
157	(default: 1)
158
159	-thesaurus Name Name of a thesaurus stored in Phind format in the
160	collection's etc directory.
161	(default: not set)
162
163	-untidy Don't remove working files.
164
165	"; }
166
167	# Phrase delimiter symbols - these should be abstracted out someplace
168
169	my $colstart = "COLLECTIONSTART";
170	my $colend = "COLLECTIONEND";
171	my $doclimit = "DOCUMENTLIMIT";
172	my $senlimit = "SENTENCELIMIT";
173	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
174
175
176	# Create a new Phind browser based on collect.cfg
177
178	sub new {
179	my $class = shift (@_);
180	my $self = new BasClas($class, @_);
181
182	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
183	my $option_list = $self->{'option_list'};
184	push( @{$option_list}, $options );
185
186	my $out = $self->{'outhandle'};
187
188	# Ensure the Phind generate scripts are in place
189	my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
190	$file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
191	my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
192
193	if (!(-e $file1)) {
194	print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
195	exit(1);
196	}
197
198	# Parse classifier arguments
199	my $builddir = "";
200	if (!parsargv::parse(\@_,
201	q^text/.*/section:Title,section:text^, \$self->{'indexes'},
202	q^title/.*/Title^, \$self->{'titlefield'},
203	q^button/.*/Phrase^, \$self->{'buttonname'},
204	q^language/.*/en^, \$language,
205	q^builddir/.*/^, \$builddir,
206	q^savephrases/.*/^, \$self->{'savephrases'},
207	q^suffixmode/\d/1^, \$self->{'suffixmode'},
208	q^thesaurus/.*/^, \$self->{'thesaurus'},
209	q^untidy^, \$self->{'untidy'},
210	"allow_extra_options")) {
211
212	print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
213	&print_usage();
214	die "\n";
215	}
216
217	# classifier information
218	$self->{'collection'} = $ENV{'GSDLCOLLECTION'};
219
220	# limit languages
221	$self->{'language_exp'} = $language;
222
223	# collection directories
224	$self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
225	if (!$builddir) {
226	$builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
227	}
228	$self->{'builddir'} = $builddir;
229
230	$self->{'total'} = 0;
231
232	return bless $self, $class;
233	}
234
235
236	# Initialise the Phind classifier
237
238	sub init {
239	my $self = shift (@_);
240
241	# ensure we have a build directory
242	my $builddir = $self->{'builddir'};
243	die unless (-e "$builddir");
244
245	# create Phind directory
246	my $phnumber = 1;
247	my $phinddir = &util::filename_cat($builddir, "phind1");
248	while (-e "$phinddir") {
249	$phnumber++;
250	$phinddir = &util::filename_cat($builddir, "phind$phnumber");
251	}
252	&util::mk_dir("$phinddir");
253	$self->{'phinddir'} = $phinddir;
254	$self->{'phindnumber'} = $phnumber;
255
256	push(@removedirs, $phinddir) unless $self->{'untidy'};
257
258	# open filehandles for documents and text
259	my $clausefile = &util::filename_cat("$phinddir", "clauses");
260	&util::rm($clausefile) if (-e $clausefile);
261
262	my $txthandle = 'TEXT' . $phnumber;
263	open($txthandle, ">$clausefile") \|\| die "Cannot open $clausefile: $!";
264	$self->{'txthandle'} = $txthandle;
265
266	my $docfile = &util::filename_cat("$phinddir", "docs.txt");
267	&util::rm($docfile) if (-e $docfile);
268
269	my $dochandle = 'DOC' . $phnumber;
270	open($dochandle, ">$docfile") \|\| die "Cannot open $docfile: $!";
271	$self->{'dochandle'} = $dochandle;
272
273	}
274
275
276	# Classify each document.
277	#
278	# Each document is passed here in turn. The classifier extracts the
279	# text of each and stores it in the clauses file. Document details are
280	# stored in the docs.txt file.
281
282	sub classify {
283	my $self = shift (@_);
284	my $doc_obj = shift @_;
285
286	my $verbosity = $self->{'verbosity'};
287	my $top_section = $doc_obj->get_top_section();
288
289	my $titlefield = $self->{'titlefield'};
290
291	my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
292	if (!defined($title)) {
293	$title = "";
294	print STDERR "Phind: document has no title\n";
295	}
296	print "process: $title\n" if ($verbosity > 2);
297
298	# Only consider the file if it is in the correct language
299	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
300	my $phrlanguage = $self->{'language_exp'};
301	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
302
303	# record this file
304	$self->{'total'} ++;
305	print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
306
307
308	# Store document details
309	my $OID = $doc_obj->get_OID();
310	$OID = "NULL" unless defined $OID;
311	my $dochandle = $self->{'dochandle'};
312	print $dochandle "<Document>\t$OID\t$title\n";
313
314	# Store the text occuring in this object
315
316	# output the document delimiter
317	my $txthandle = $self->{'txthandle'};
318	print $txthandle "$doclimit\n";
319
320	# iterarate over the required indexes and store their text
321	my $indexes = $self->{'indexes'};
322	my $text = "";
323	my ($part, $level, $field, $section, $data, $dataref);
324
325	foreach $part (split(/,/, $indexes)) {
326
327	# Each field has a level and a data element ((e.g. document:Title)
328	($level, $field) = split(/:/, $part);
329	die unless ($level && $field);
330
331	# Extract the text from every section
332	# (In phind, document:text and section:text are equivalent)
333	if ($field eq "text") {
334	$data = "";
335	$section = $doc_obj->get_top_section();
336	while (defined($section)) {
337	$data .= $doc_obj->get_text($section) . "\n";
338	$section = $doc_obj->get_next_section($section);
339	}
340	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
341	}
342
343	# Extract a metadata field from a document
344	# (If there is more than one element of the given type, get them all.)
345	elsif ($level eq "document") {
346	$dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
347	foreach $data (@$dataref) {
348	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
349	}
350	}
351
352	# Extract metadata from every section in a document
353	elsif ($level eq "section") {
354	$data = "";
355	$section = $doc_obj->get_top_section();
356	while (defined($section)) {
357	$dataref = $doc_obj->get_metadata($section, $field);
358	$data .= join("\n", @$dataref) . "\n";
359	$section = $doc_obj->get_next_section($section);
360	}
361	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
362	}
363
364	# Some sort of specification which I don't understand
365	else {
366	die "Unknown level ($level) in Phind index ($part)\n";
367	}
368
369	}
370
371	# output the text
372	$text =~ tr/\n//s;
373	print $txthandle "$text";
374	}
375
376
377	# Construct the classifier from the information already gathered
378	#
379	# When get_classify_info is called, the clauses and docs.txt files have
380	# already been constructed in the Phind directory. This function will
381	# translate them into compressed, indexed MGPP files that can be read by
382	# the phindcgi script. It will also register our classifier so that it
383	# shows up in the navigation bar.
384
385	sub get_classify_info {
386	my $self = shift (@_);
387
388	close $self->{'dochandle'};
389	close $self->{'txthandle'};
390	my $verbosity = $self->{'verbosity'};
391	my $out = $self->{'outhandle'};
392	my $phinddir = $self->{'phinddir'};
393
394	my $osextra = "";
395	if ($ENV{'GSDLOS'} !~ /^windows$/i) {
396	$osextra = " -d /";
397	}
398
399	if ($verbosity) {
400	print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
401	print $out "*** in ", $self->{'phinddir'}, "\n";
402	}
403
404	# Construct phind indexes
405	my $suffixmode = $self->{'suffixmode'};
406	my ($command, $status);
407
408	# Generate the vocabulary, symbol statistics, and numbers file
409	# from the clauses file
410	print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
411	&extract_vocabulary($self);
412
413	# Use the suffix program to generate the phind/phrases file
414	print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
415	&execute("suffix \"$phinddir\" $suffixmode $verbosity", $verbosity, $out);
416
417	# check that we generated some files. It's not necessarily an error if
418	# we didn't (execute() would have quit on error), but we can't go on.
419	my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
420	if (! -r $phrasesfile) {
421	print $out "\nNo phrases found for Phind classifier!\n";
422	return;
423	}
424
425	# Create the phrase file and put phrase numbers in phind/phrases
426	print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
427	&renumber_phrases($self);
428
429	print $out "\nCreating phrase databases\n";
430	my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
431	my $mg_stem = &util::filename_cat($phinddir, "pdata");
432
433	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
434	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
435	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
436
437	# create the mg index of words
438	print $out "\nCreating word-level search indexes\n";
439	$mg_input = &util::filename_cat($phinddir, "pword.txt");
440	$mg_stem = &util::filename_cat($phinddir, "pword");
441
442	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
443	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
444	&execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
445	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
446	&execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
447	&execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
448
449	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
450	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
451	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
452
453	# create the mg document information database
454	print $out "\nCreating document information databases\n";
455	$mg_input = &util::filename_cat($phinddir, "docs.txt");
456	$mg_stem = &util::filename_cat($phinddir, "docs");
457
458	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
459	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
460	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
461
462	# Return the information about the classifier that we'll later want to
463	# use to create macros when the Phind classifier document is displayed.
464	my %classifyinfo = ('thistype'=>'Invisible',
465	'childtype'=>'Phind',
466	'Title'=>$self->{'buttonname'},
467	'parameters'=>"phindnumber=$self->{'phindnumber'}",
468	'contains'=>[]);
469
470	my $collection = $self->{'collection'};
471	my $url = "library?a=p&p=phind&c=$collection";
472	push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
473
474	return \%classifyinfo;
475	}
476
477
478
479	sub convert_gml_to_tokens {
480
481	my ($language_exp, $text) = @_;
482
483	# escape any magic words... - jrm21
484	foreach my $delim (@delimiters) {
485	my $replacement=lc($delim);
486	my $num= $text=~ s/$delim/$replacement/g;
487	if (!$num) {$num=0;}
488	}
489
490	if ($language_exp =~ /en/) {
491	return &convert_gml_to_tokens_EN($text);
492	}
493
494	if ($language_exp =~ /zh/) {
495	return &convert_gml_to_tokens_ZH($text);
496	}
497
498	$_ = $text;
499
500	# 1. remove GML tags
501
502	# Remove everything that is in a tag
503	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
504	s/\s<br>\s/ LINEBREAK /isgo;
505	s/<[^>]*>/ /sgo;
506
507	# Now we have the text, but it may contain HTML
508	# elements coded as > etc. Remove these tags.
509	s/&/&/sgo;
510	s/</</sgo;
511	s/>/>/sgo;
512	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
513	s/\s<br>\s/ LINEBREAK /isgo;
514	s/<[^>]*>/ /sgo;
515
516	# replace<p> and <br> placeholders with clause break symbol (\n)
517	s/\s+/ /gso;
518	s/PARAGRAPHBREAK/\n/sgo;
519	s/LINEBREAK/\n/sgo;
520
521
522
523
524	# 2. Split the remaining text into space-delimited tokens
525
526	# Convert any HTML special characters (like ") to their UTF8 equivalent
527	s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
528
529	# Split text at word boundaries
530	s/\b/ /go;
531
532	# 3. Convert the remaining text to "clause format"
533
534	# Insert newline if the end of a sentence is detected
535	# (delimter is: "[\.\?\!]\s")
536	# s/\s*[\.\?\!]\s+/\n/go;
537
538	# remove unnecessary punctuation and replace with clause break symbol (\n)
539	s/[^\w ]/\n/go;
540
541	# remove extraneous whitespace
542	s/ +/ /sgo;
543	s/^\s+//mgo;
544	s/\s*$/\n/mgo;
545
546	# remove lines that contain one word or less
547	s/^\S*$//mgo;
548	s/^\s*$//mgo;
549	tr/\n//s;
550
551	return $_;
552	}
553
554	# a chinese version
555	sub convert_gml_to_tokens_ZH {
556
557	$_ = shift @_;
558
559	# Replace all whitespace with a simple space
560	s/\s+/ /gs;
561	# Remove everything that is in a tag
562	s/\s<p>\s/ PARAGRAPHBREAK /isg;
563	s/\s<br>\s/ LINEBREAK /isg;
564	s/<[^>]*>/ /sg;
565
566	# Now we have the text, but it may contain HTML
567	# elements coded as > etc. Remove these tags.
568	s/</</sg;
569	s/>/>/sg;
570
571	s/\s+/ /sg;
572	s/\s<p>\s/ PARAGRAPHBREAK /isg;
573	s/\s<br>\s/ LINEBREAK /isg;
574	s/<[^>]*>/ /sg;
575
576	# remove & and other miscellaneous markup tags
577	s/&/&/sg;
578	s/</</sg;
579	s/>/>/sg;
580	s/&/&/sg;
581
582	# replace<p> and <br> placeholders with carriage returns
583	s/PARAGRAPHBREAK/\n/sg;
584	s/LINEBREAK/\n/sg;
585
586
587	# print STDERR "text:$_\n";
588	return $_;
589	}
590
591	# A version of convert_gml_to_tokens that is fine-tuned to the English language.
592
593	sub convert_gml_to_tokens_EN {
594	$_ = shift @_;
595
596	# FIRST, remove GML tags
597
598	# Replace all whitespace with a simple space
599	s/\s+/ /gs;
600
601	# Remove everything that is in a tag
602	s/\s<p>\s/ PARAGRAPHBREAK /isg;
603	s/\s<br>\s/ LINEBREAK /isg;
604	s/<[^>]*>/ /sg;
605
606	# Now we have the text, but it may contain HTML
607	# elements coded as > etc. Remove these tags.
608	s/</</sg;
609	s/>/>/sg;
610
611	s/\s+/ /sg;
612	s/\s<p>\s/ PARAGRAPHBREAK /isg;
613	s/\s<br>\s/ LINEBREAK /isg;
614	s/<[^>]*>/ /sg;
615
616	# remove & and other miscellaneous markup tags
617	s/&/&/sg;
618	s/</</sg;
619	s/>/>/sg;
620	s/&/&/sg;
621
622	# replace<p> and <br> placeholders with carriage returns
623	s/PARAGRAPHBREAK/\n/sg;
624	s/LINEBREAK/\n/sg;
625
626
627	# Exceptional punctuation
628	#
629	# We make special cases of some punctuation
630
631	# remove any apostrophe that indicates omitted letters
632	s/(\w+)\'(\w*\s)/ $1$2 /g;
633
634	# remove period that appears in a person's initals
635	s/\s([A-Z])\./ $1 /g;
636
637	# replace hyphens in hypheanted words and names with a space
638	s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
639
640	# Convert the remaining text to "clause format",
641	# This means removing all excess punctuation and garbage text,
642	# normalising valid punctuation to fullstops and commas,
643	# then putting one cluse on each line.
644
645	# Insert newline when the end of a sentence is detected
646	# (delimter is: "[\.\?\!]\s")
647	s/\s*[\.\?\!]\s+/\n/g;
648
649	# split numbers after four digits
650	s/(\d\d\d\d)/$1 /g;
651
652	# split words after 32 characters
653
654	# squash repeated punctuation
655	tr/A-Za-z0-9 //cs;
656
657	# save email addresses
658	# s/\w+@\w+\.[\w\.]+/EMAIL/g;
659
660	# normalise clause breaks (mostly punctuation symbols) to commas
661	s/[^A-Za-z0-9 \n]+/ , /g;
662
663	# Remove repeated commas, and replace with newline
664	s/\s*,[, ]+/\n/g;
665
666	# remove extra whitespace
667	s/ +/ /sg;
668	s/^\s+//mg;
669	s/\s*$/\n/mg;
670
671	# remove lines that contain one word or less
672	s/^\w*$//mg;
673	s/^\s*$//mg;
674	tr/\n//s;
675
676	return $_;
677
678	}
679
680
681
682	# Execute a system command
683
684	sub execute {
685	my ($command, $verbosity, $outhandle) = @_;
686	print $outhandle "Executing: $command\n" if ($verbosity > 2);
687	$! = 0;
688	my $status = system($command);
689	if ($status != 0) {
690	print STDERR "Phind - Error executing '$command': $!\n";
691	exit($status); # this causes the build to fail...
692	}
693	}
694
695
696	# Generate the vocabulary, symbol statistics, and numbers file from the
697	# clauses file. This is legacy code, so is a bit messy and probably wont
698	# run under windows.
699
700	sub extract_vocabulary {
701	my ($self) = @_;
702
703	my $verbosity = $self->{'verbosity'};
704	my $out = $self->{'outhandle'};
705
706	my $collectiondir = $self->{'collectiondir'};
707	my $phinddir = $self->{'phinddir'};
708
709	my $language_exp = $self->{'language_exp'};
710
711	my ($w, $l, $line, $word);
712
713	my ($first_delimiter, $last_delimiter,
714	$first_stopword, $last_stopword,
715	$first_extractword, $last_extractword,
716	$first_contentword, $last_contentword,
717	$phrasedelimiter);
718
719	my $thesaurus = $self->{'thesaurus'};
720	my ($thesaurus_links, $thesaurus_terms,
721	%thesaurus, $first_thesaurusword, $last_thesaurusword);
722
723	my %symbol;
724	my (%freq);
725
726	print $out "Calculating vocabulary\n" if ($verbosity > 1);
727
728	# Read and store the stopwords
729	my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
730	my $stopword_files = ();
731	my ($language, $language_dir, $file, $file_name);
732	my %stopwords;
733
734	# Examine each directory in the stopword directory
735	opendir(STOPDIR, $stopdir);
736	foreach $language (readdir STOPDIR) {
737
738	# Ignore entries that do not match the classifier's language
739	next unless ($language =~ /$language_exp/);
740	$language_dir = &util::filename_cat($stopdir, $language);
741	next unless (-d "$language_dir");
742
743	opendir(LANGDIR, $language_dir);
744	foreach $file (readdir LANGDIR) {
745
746	# Ignore entries that are not stopword files
747	next unless ($file =~ /sw$/);
748	$file_name = &util::filename_cat($language_dir, $file);
749	next unless (-f "$file_name");
750
751	# Read the stopwords
752	open(STOPFILE, "<$file_name");
753	while (<STOPFILE>) {
754	s/^\s+//;
755	s/\s.*//;
756	$word = $_;
757	$l = lc($word);
758	$stopwords{$l} = $word;
759	}
760	close STOPFILE;
761
762	}
763	closedir LANGDIR;
764	}
765	closedir STOPDIR;
766
767	# Read thesaurus information
768	if ($thesaurus) {
769
770	# link file exists
771	$thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
772	die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
773
774	# ensure term file exists in the correct language
775	if ($language_exp =~ /^([a-z][a-z])/) {
776	$language = $1;
777	} else {
778	$language = 'en';
779	}
780	$thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
781	die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
782
783
784	# Read the thesaurus terms
785	open(TH, "<$thesaurus_terms");
786	while(<TH>) {
787	s/^\d+ //;
788	s/\(.*\)//;
789	foreach $w (split(/\s+/, $_)) {
790	$thesaurus{lc($w)} = $w;
791	}
792	}
793	close TH;
794	}
795
796	# Read words in the text and count occurences
797	open(TXT, "<$phinddir/clauses");
798
799	my @words;
800	while(<TXT>) {
801	$line = $_;
802	next unless ($line =~ /./);
803
804	@words = split(/\s+/, $line);
805	foreach $w (@words) {
806	$l = lc($w);
807	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
808	$freq{$w}++;
809	}
810	$freq{$senlimit}++;
811	}
812
813	close TXT;
814
815	# Calculate the "best" form of each word
816	my (%bestform, %totalfreq, %bestfreq);
817
818	foreach $w (sort (keys %freq)) {
819	$l = lc($w);
820
821	# totalfreq is the number of times a term appears in any form
822	$totalfreq{$l} += $freq{$w};
823
824	if (defined $stopwords{$l}) {
825	$bestform{$l} = $stopwords{$l};
826
827	} elsif (defined $thesaurus{$l}) {
828	$bestform{$l} = $thesaurus{$l};
829
830	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
831	$bestfreq{$l} = $freq{$w};
832	$bestform{$l} = $w;
833	}
834	}
835	undef %freq;
836	undef %bestfreq;
837
838
839	# Assign symbol numbers to tokens
840	my $nextsymbol = 1;
841	my (@vocab);
842
843	# Delimiters
844	$first_delimiter = 1;
845
846	foreach $word (@delimiters) {
847
848	# $word = lc($word); # jrm21
849	$word = uc($word);
850	$bestform{$word} = $word;
851	$vocab[$nextsymbol] = $word;
852	$symbol{$word} = $nextsymbol;
853	$nextsymbol++;
854	}
855	$last_delimiter = $nextsymbol - 1;
856	# Stopwords
857	$first_stopword = $nextsymbol;
858
859	foreach my $word (sort keys %stopwords) {
860	# don't include stopword unless it occurs in the text
861	$word = lc($word);
862	next unless ($totalfreq{$word});
863	next if ($symbol{$word});
864
865	$vocab[$nextsymbol] = $word;
866	$symbol{$word} = $nextsymbol;
867	$nextsymbol++;
868	}
869	$last_stopword = $nextsymbol - 1;
870	$first_contentword = $nextsymbol;
871
872	# Thesaurus terms
873	if ($thesaurus) {
874	$first_thesaurusword = $nextsymbol;
875
876	foreach my $word (sort keys %thesaurus) {
877
878	$word = lc($word);
879	next if ($symbol{$word});
880	$bestform{$word} = $thesaurus{$word};
881
882	$vocab[$nextsymbol] = $word;
883	$symbol{$word} = $nextsymbol;
884	$nextsymbol++;
885
886	}
887	$last_thesaurusword = $nextsymbol - 1;
888	}
889
890	# Other content words
891	$first_extractword = $nextsymbol;
892
893	foreach my $word (sort (keys %bestform)) {
894
895	next if ($symbol{$word});
896
897	$vocab[$nextsymbol] = $word;
898	$symbol{$word} = $nextsymbol;
899	$nextsymbol++;
900	}
901	$last_extractword = $nextsymbol - 1;
902	$last_contentword = $nextsymbol - 1;
903
904	# Outut the words
905	print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
906	open(VOC, ">$phinddir/clauses.vocab");
907
908	for (my $i = 1; $i < $nextsymbol; $i++) {
909	$w = $vocab[$i];
910
911	print VOC "$bestform{$w}\n";
912	$totalfreq{$w} = 0 unless ($totalfreq{$w});
913	}
914	close VOC;
915
916
917	# Create statistics file
918	# Output statistics about the vocablary
919	print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
920	&util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
921
922	open(STAT, ">$phinddir/clauses.stats")
923	\|\| die "Cannot open $phinddir/clauses.stats: $!";
924
925	print STAT "first_delimiter $first_delimiter\n";
926	print STAT "last_delimiter $last_delimiter\n";
927	print STAT "first_stopword $first_stopword\n";
928	print STAT "last_stopword $last_stopword\n";
929	if ($thesaurus) {
930	print STAT "first_thesaurusword $first_thesaurusword\n";
931	print STAT "last_thesaurusword $last_thesaurusword\n";
932	}
933	print STAT "first_extractword $first_extractword\n";
934	print STAT "last_extractword $last_extractword\n";
935	print STAT "first_contentword $first_contentword\n";
936	print STAT "last_contentword $last_contentword\n";
937	print STAT "first_symbol $first_delimiter\n";
938	print STAT "last_symbol $last_contentword\n";
939	print STAT "first_word $first_stopword\n";
940	print STAT "last_word $last_contentword\n";
941	close STAT;
942
943	undef @vocab;
944
945
946	# Create numbers file
947	# Save text as symbol numbers
948	print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
949
950	open(TXT, "<$phinddir/clauses");
951	open(NUM, ">$phinddir/clauses.numbers");
952
953	## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
954	## print NUM "$symbol{lc($colstart)}\n"; # jrm21
955	$phrasedelimiter = $symbol{$senlimit};
956	print NUM "$symbol{$colstart}\n";
957
958	# set up the special symbols that delimit documents and sentences
959	while(<TXT>) {
960
961	# split sentence into a list of tokens
962	$line = $_;
963	next unless ($line =~ /./);
964	@words = split(/\s+/, $line);
965
966	# output one token at a time
967	foreach $word (@words) {
968	# don't lower-case special delimiters - jrm21
969	if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
970	$word = lc($word);
971	}
972	print NUM "$symbol{$word}\n";
973	}
974
975	# output phrase delimiter
976	print NUM "$phrasedelimiter\n";
977	}
978
979	close TXT;
980	# print NUM "$symbol{lc($colend)}\n";# jrm21
981	print NUM "$symbol{$colend}\n";
982	close NUM;
983
984	# Save thesaurus data in one convienient file
985	if ($thesaurus) {
986
987	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
988
989
990	print $out "Saving thesaurus as numbers in $thesaurusfile\n"
991	if ($verbosity > 1);
992
993	# Read the thesaurus terms
994	my ($num, $text, %thes_symbols);
995
996	open(TH, "<$thesaurus_terms");
997	while(<TH>) {
998	chomp;
999	@words = split(/\s+/, $_);
1000	$num = shift @words;
1001	$text = "";
1002
1003	# translate words into symbol numbers
1004	foreach $word (@words) {
1005	$word = lc($word);
1006	if ($symbol{$word}) {
1007	$text .= "s$symbol{$word} ";
1008	} elsif ($verbosity) {
1009	print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
1010	}
1011	}
1012	$text =~ s/ $//;
1013	$thes_symbols{$num} = $text;
1014	}
1015	close TH;
1016
1017	# Read the thesaurus links and write the corresponding data
1018	open(TH, "<$thesaurus_links");
1019	open(THOUT, ">$thesaurusfile");
1020
1021	while(<TH>) {
1022	chomp;
1023	($num, $text) = split(/:/, $_);
1024
1025	if (defined($thes_symbols{$num})) {
1026	print THOUT "$num:$thes_symbols{$num}:$text\n";
1027	} else {
1028	print THOUT "$num:untranslated:$text\n";
1029	}
1030	}
1031	close TH;
1032	close THOUT;
1033	}
1034
1035
1036
1037
1038	}
1039
1040
1041	# renumber_phrases
1042	#
1043	# Prepare the phrases file to be input to mgpp. The biggest problem is
1044	# reconciling the phrase identifiers used by the suffix program (which
1045	# we'll call suffix-id numbers) with the numbers used in the thesaurus
1046	# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
1047	# Phind-id numbers must be sorted by frequency of occurance.
1048	#
1049	# Start creating a set of phind-id numbers from the sorted suffix-id
1050	# numbers and (if required) the thesaurus-id numbers. Then add any other
1051	# phrases occuring in the thesaurus.
1052	#
1053	# The last thing we have to do is restore the vocabulary information to the
1054	# phrase file so that the phrases are stored as words, not as symbol
1055	# numbers.
1056
1057	# The original phrases file looks something like this:
1058	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1059	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1060	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1061	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1062
1063
1064	sub renumber_phrases {
1065	my ($self) = @_;
1066
1067	renumber_suffix_data($self);
1068	renumber_thesaurus_data($self);
1069	restore_vocabulary_data($self);
1070
1071	}
1072
1073
1074
1075	# renumber_suffix_data
1076	#
1077	# Translate phrases file to phrases.2 using phind keys instead
1078	# of suffix keys and sorting the expansion data.
1079
1080	sub renumber_suffix_data {
1081	my ($self) = @_;
1082
1083	my $verbosity = $self->{'verbosity'};
1084	my $out = $self->{'outhandle'};
1085	print $out "Translate phrases: suffix-ids become phind-id's\n"
1086	if ($verbosity);
1087
1088	my $phinddir = $self->{'phinddir'};
1089	my $infile = &util::filename_cat($phinddir, 'phrases');
1090	my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1091
1092	# Read the phrase file. Calculate initial set of phind-id
1093	# numbers and store (suffixid -> frequency) relation.
1094
1095	my %suffixtophind;
1096	my @phindfrequency;
1097	my (@fields, $suffixid);
1098	my $nextphind = 1;
1099
1100	open(IN, "<$infile");
1101	while(<IN>) {
1102
1103	chomp;
1104	@fields = split(/:/, $_);
1105
1106	# get next suffixid and phindid
1107	$suffixid = shift @fields;
1108	$suffixtophind{$suffixid} = $nextphind;
1109
1110	# store total frequency
1111	shift @fields;
1112	$totalfrequency[$nextphind] = shift @fields;
1113
1114	$nextphind++;
1115	}
1116	close IN;
1117
1118
1119	# Translate phrases file to phrases.2. Use phind keys (not suffix
1120	# keys), sort expansion and document occurance data in order of
1121	# descending frequency..
1122	open(IN, "<$infile");
1123	open(OUT, ">$outfile");
1124
1125	my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1126	my (@documwents, @newexp, $k, $n);
1127	my $linenumber = 0;
1128
1129	while(<IN>) {
1130
1131	# read the line
1132	chomp;
1133	@fields = split(/:/, $_);
1134
1135	# get a phrase number for this line
1136	$suffixid = shift @fields;
1137	die unless (defined($suffixtophind{$suffixid}));
1138	$phindid = $suffixtophind{$suffixid};
1139
1140	# get the symbols in the phrase
1141	$text = shift @fields;
1142
1143	# output status information
1144	$linenumber++;
1145	if ($verbosity > 2) {
1146	if ($linenumber % 1000 == 0) {
1147	print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1148	}
1149	print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1150	}
1151
1152	# get the phrase frequency
1153	$tf = shift @fields;
1154
1155	# get the number of expansions
1156	$countexp = shift @fields;
1157
1158	# get the expansions, convert them into phind-id numbers, and sort them
1159	$expansions = shift @fields;
1160	@newexp = ();
1161	foreach $k (split(/,/, $expansions)) {
1162	die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1163	$n = $suffixtophind{$k};
1164	push @newexp, $n;
1165	}
1166	@newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1167
1168	# get the number of documents
1169	$countdocs = shift @fields;
1170
1171	# get the documents and sort them
1172	$documents = shift @fields;
1173	$documents =~ s/d//g;
1174	@documents = split(/;/, $documents);
1175	@documents = sort by_doc_frequency @documents;
1176
1177	# output the phrase data
1178	print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1179	print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1180
1181	}
1182
1183	close IN;
1184	close OUT;
1185	}
1186
1187
1188	# renumber_thesaurus_data
1189	#
1190	# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1191
1192	sub renumber_thesaurus_data {
1193	my ($self) = @_;
1194
1195	my $out = $self->{'outhandle'};
1196	my $verbosity = $self->{'verbosity'};
1197	my $thesaurus = $self->{'thesaurus'};
1198
1199	my $phinddir = $self->{'phinddir'};
1200	my $infile = &util::filename_cat($phinddir, "phrases.2");
1201	my $outfile = &util::filename_cat($phinddir, "phrases.3");
1202
1203
1204	# If no thesaurus is defined, simply move the phrases file.
1205	if (!$thesaurus) {
1206	print $out "Translate phrases.2: no thesaurus data\n"
1207	if ($verbosity);
1208	&util::mv($infile, $outfile);
1209	return;
1210	}
1211
1212	print $out "Translate phrases.2: add thesaurus data\n"
1213	if ($verbosity);
1214
1215	# 1.
1216	# Read thesaurus file and store (symbols->thesaurusid) mapping
1217	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1218	my %symbolstothesid;
1219	my (@fields, $thesid, $symbols);
1220
1221	open(TH, "<$thesaurusfile");
1222
1223	while (<TH>) {
1224
1225	chomp;
1226	@fields = split(/:/, $_);
1227
1228	# get id and text
1229	$thesid = shift @fields;
1230	$symbols = shift @fields;
1231	$symbolstothesid{$symbols} = $thesid;
1232	}
1233	close TH;
1234
1235	# 2.
1236	# Read phrases file to find thesaurus entries that already
1237	# have a phindid. Store their phind-ids for later translation,
1238	# and store their frequency for later sorting.
1239	my %thesaurustophindid;
1240	my %phindidtofrequency;
1241	my ($phindid, $freq);
1242
1243	open(IN, "<$infile");
1244
1245	while(<IN>) {
1246
1247	chomp;
1248	@fields = split(/:/, $_);
1249
1250	# phindid and symbols for this line
1251	$phindid = shift @fields;
1252	$symbols = shift @fields;
1253	$freq = shift @fields;
1254
1255	# do we have a thesaurus id corresponding to this phrase?
1256	if (defined($symbolstothesid{$symbols})) {
1257	$thesid = $symbolstothesid{$symbols};
1258	$thesaurustophindid{$thesid} = $phindid;
1259	$phindidtofrequency{$phindid} = $freq;
1260	}
1261	}
1262	close IN;
1263
1264	undef %symbolstothesid;
1265
1266	# 3.
1267	# Create phind-id numbers for remaining thesaurus entries,
1268	# and note that their frequency is 0 for later sorting.
1269	my $nextphindid = $phindid + 1;
1270
1271	open(TH, "<$thesaurusfile");
1272	while(<TH>) {
1273
1274	chomp;
1275	@fields = split(/:/, $_);
1276
1277	# read thesaurus-id and ensure it has a corresponding phind-id
1278	$thesid = shift @fields;
1279	if (!defined($thesaurustophindid{$thesid})) {
1280	$thesaurustophindid{$thesid} = $nextphindid;
1281	$phindidtofrequency{$nextphindid} = 0;
1282	$nextphindid++;
1283	}
1284	}
1285	close TH;
1286
1287	# 4.
1288	# Translate thesaurus file, replacing thesaurus-id numbers with
1289	# phind-id numbers.
1290	my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1291	my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1292	my (@links, $linkid, %linkidtotype, $newrelation);
1293
1294	open(TH, "<$thesaurusfile");
1295	open(TO, ">$newthesaurusfile");
1296	while(<TH>) {
1297
1298	chomp;
1299	@fields = split(/:/, $_);
1300
1301	# phindid and symbols for this line
1302	($thesid, $symbols, $relations) = @fields;
1303
1304	die unless ($thesid && $symbols);
1305	die unless $thesaurustophindid{$thesid};
1306	$phindid = $thesaurustophindid{$thesid};
1307
1308	# convert each part of the relation string to use phind-id numbers
1309	# at the same time, we want to sort the list by frequency.
1310	undef %linkidtotype;
1311
1312	foreach $linktext (split(/;/, $relations)) {
1313	@linkdata = split(/,/, $linktext);
1314
1315	# remember the linktype (e.g. BT, NT)
1316	$linktype = shift @linkdata;
1317
1318	# store the type of each link
1319	foreach $thesid (@linkdata) {
1320	die unless (defined($thesaurustophindid{$thesid}));
1321	$linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1322	}
1323	}
1324
1325	# sort the list of links, first by frequency, then by type.
1326	@links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1327	or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1328	$linkcounter = (scalar @links);
1329
1330	# create a string describing the link information
1331	$linktype = $linkidtotype{$links[0]};
1332	$newrelation = $linktype;
1333	foreach $linkid (@links) {
1334	if ($linkidtotype{$linkid} ne $linktype) {
1335	$linktype = $linkidtotype{$linkid};
1336	$newrelation .= ";" . $linktype;
1337	}
1338	$newrelation .= "," . $linkid;
1339	}
1340	$newrelation .= ";";
1341
1342
1343	# output the new line
1344	print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1345	}
1346	close TH;
1347	close TO;
1348
1349	undef %thesaurustophindid;
1350	undef %linkidtotype;
1351	undef %phindidtofrequency;
1352
1353	# 5.
1354	# Read thesaurus data (in phind-id format) into memory
1355	my %thesaurusdata;
1356
1357	open(TH, "<$newthesaurusfile");
1358	while(<TH>) {
1359	chomp;
1360	($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1361	die unless ($phindid && $symbols);
1362	$thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1363	}
1364	close TH;
1365
1366	# 6.
1367	# Add thesaurus data to phrases file
1368	my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1369	my (@documwents, @newexp, $k, $n);
1370	my $linenumber = 0;
1371
1372	open(IN, "<$infile");
1373	open(OUT, ">$outfile");
1374
1375	# Update existing phrases
1376	while(<IN>) {
1377
1378	chomp;
1379	@fields = split(/:/, $_);
1380
1381	# get data for this line
1382	$phindid = shift @fields;
1383
1384	# output the phrase data, with thesaurus information
1385	print OUT "$phindid:", join(":", @fields);
1386
1387	# add thesaurus data
1388	if (defined($thesaurusdata{$phindid})) {
1389	@fields = split(/:/, $thesaurusdata{$phindid});
1390	shift @fields;
1391	$linkcounter = shift @fields;
1392	$relations = shift @fields;
1393
1394	print OUT ":$linkcounter:$relations";
1395	$thesaurusdata{$phindid} = "";
1396	}
1397	print OUT "\n";
1398	}
1399	close IN;
1400
1401	# Add phrases that aren't already in the file
1402	foreach $phindid (sort numerically keys %thesaurusdata) {
1403	next unless ($thesaurusdata{$phindid});
1404
1405	@fields = split(/:/, $thesaurusdata{$phindid});
1406	$symbols = shift @fields;
1407	$linkcounter = shift @fields;
1408	$relations = shift @fields;
1409
1410	print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1411	}
1412	close OUT;
1413
1414	}
1415
1416	# restore_vocabulary_data
1417	#
1418	# Read phrases.3 and restore vocabulary information. Then write
1419	# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1420	# (if requested) to the saved phrases file.
1421
1422	sub restore_vocabulary_data {
1423	my ($self) = @_;
1424
1425	my $out = $self->{'outhandle'};
1426	my $verbosity = $self->{'verbosity'};
1427	print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1428
1429	my $phinddir = $self->{'phinddir'};
1430	my $infile = &util::filename_cat($phinddir, 'phrases.3');
1431	my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1432	my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1433	my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1434
1435	my $savephrases = $self->{'savephrases'};
1436
1437	# 1.
1438	# Read the vocabulary file
1439	open(V, "<$vocabfile")
1440	\|\| die "Cannot open $vocabfile: $!";
1441	my @symbol;
1442	my $i = 1;
1443	while(<V>) {
1444	chomp;
1445	$symbol[$i++] = $_;
1446	}
1447	close V;
1448
1449	# 2.
1450	# Translate phrases.3 to MGPP input files
1451	my ($key, $text, $word, $isThesaurus);
1452	my @fields;
1453	my $linenumber = 0;
1454
1455	open(IN, "<$infile");
1456	open(DATA, ">$datafile");
1457	open(WORD, ">$wordfile");
1458
1459	# Save the phrases in a separate text file
1460	if ($savephrases) {
1461	print $out "Saving phrases in $savephrases\n" if ($verbosity);
1462	open(SAVE, ">$savephrases");
1463	}
1464
1465	while(<IN>) {
1466
1467	# read the line
1468	chomp;
1469	$line = $_;
1470	@fields = split(/:/, $line);
1471
1472	# get a phrase number for this line
1473	$key = shift @fields;
1474
1475	# restore the text of the phrase
1476	$text = shift @fields;
1477	$text =~ s/s(\d+)/$symbol[$1]/g;
1478	if ($text =~ / /) {
1479	$word = "";
1480	} elsif ($text ne 'untranslated') {
1481	$word = $text;
1482	}
1483
1484	# output the phrase data
1485	print DATA "<Document>";
1486	print DATA "$key:$text:", join(":", @fields), ":\n";
1487
1488	# output the word index search data
1489	print WORD "<Document>$word\n";
1490
1491	# output the phrases to a text file
1492	if ($savephrases) {
1493	if ((scalar @fields) == 7) {
1494	$isThesaurus = 1;
1495	} else {
1496	$isThesaurus = 0;
1497	}
1498	print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1499	}
1500	}
1501	close IN;
1502	close WORD;
1503	close DATA;
1504	close SAVE if ($savephrases);
1505
1506	}
1507
1508
1509
1510	# sort routines used to renumber phrases
1511
1512	sub numerically { $a <=> $b }
1513
1514	sub by_doc_frequency {
1515	my $fa = 1;
1516	if ($a =~ /,/) {
1517	$fa = $a;
1518	$fa =~ s/\d+,//;
1519	}
1520	my $fb = 1;
1521	if ($b =~ /,/) {
1522	$fb = $b;
1523	$fb =~ s/\d+,//;
1524	}
1525
1526	return ($fb <=> $fa);
1527	}
1528
1529	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: