source: main/trunk/greenstone2/perllib/classify/Phind.pm@ 27904

Last change on this file since 27904 was 27904, checked in by ak19, 11 years ago

Replacing calls to deprecated utils:: subroutines with FileUtils:: equivalents.

  • Property svn:keywords set to Author Date Id Revision
File size: 41.3 KB
Line 
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32package Phind;
33
34use BaseClassifier;
35use FileUtils;
36use util;
37use ghtml;
38use unicode;
39
40use strict;
41no strict 'refs'; # allow filehandles to be variables and viceversa
42
# Phind working directories queued for cleanup when the process exits
# (consumed by the END block below).
my @removedirs = ();

# Extensions of the index files that must be kept in a phind directory;
# any other file left behind after building is a stray temporary.
my %wanted_index_files =
    map { $_ => 1 } qw(td t ti tl tsd idb ib1 ib2 ib3 i il w wa);
58
# Establish inheritance from BaseClassifier at compile time.
BEGIN {
    @Phind::ISA = ('BaseClassifier');
}
62
# Remove stray working files from each phind directory scheduled for
# cleanup in init().
#
# Tidy up stray files - we do this here as there's some weird problem
# preventing us from doing it in the get_classify_info() function (on
# windows at least) where the close() appears to fail on txthandle and
# dochandle, thus preventing us from deleting those files
sub END {

    foreach my $dir (@removedirs) {
	# Skip anything that is not a directory we can read.
	next unless -d $dir;
	# Lexical directory handle instead of the bareword DIR global,
	# which could clash with other code at interpreter shutdown.
	opendir(my $dh, $dir) or next;
	my @files = readdir $dh;
	closedir $dh;

	foreach my $file (@files) {
	    next if $file =~ /^\.\.?$/;    # skip "." and ".."
	    my ($suffix) = $file =~ /\.([^\.]+)$/;
	    # Keep only the wanted index files; delete everything else.
	    if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
		# delete it!
		&FileUtils::removeFiles (&FileUtils::filenameConcatenate ($dir, $file));
	    }
	}
    }
}
86
# Option descriptions for this classifier, in the standard Greenstone
# argument-description format (name / localised description / type /
# default / required).
my $arguments = [
    { 'name' => "text",
      'desc' => "{Phind.text}",
      'type' => "string",
      'deft' => "section:Title,section:text",
      'reqd' => "no" },
    { 'name' => "title",
      'desc' => "{Phind.title}",
      'type' => "metadata",
      'deft' => "Title",
      'reqd' => "no" },
    { 'name' => "buttonname",
      'desc' => "{BasClas.buttonname}",
      'type' => "string",
      'deft' => "Phrase",
      'reqd' => "no" },
    { 'name' => "language",
      'desc' => "{Phind.language}",
      'type' => "string",
      'deft' => "en",
      'reqd' => "no" },
    { 'name' => "savephrases",
      'desc' => "{Phind.savephrases}",
      'type' => "string",
      'deft' => "",
      'reqd' => "no" },
    { 'name' => "suffixmode",
      'desc' => "{Phind.suffixmode}",
      'type' => "int",
      'deft' => "1",
      'range' => "0,1",
      'reqd' => "no" },
    { 'name' => "min_occurs",
      'desc' => "{Phind.min_occurs}",
      'type' => "int",
      'deft' => "2",
      'range' => "1,",
      'reqd' => "no" },
    { 'name' => "thesaurus",
      'desc' => "{Phind.thesaurus}",
      'type' => "string",
      'deft' => "",
      'reqd' => "no" },
    { 'name' => "untidy",
      'desc' => "{Phind.untidy}",
      'type' => "flag",
      'reqd' => "no" },
];

# Description of the classifier itself.
my $options = {
    'name'     => "Phind",
    'desc'     => "{Phind.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
140
141
142# Phrase delimiter symbols - these should be abstracted out someplace
143
# Phrase delimiter symbols - these should be abstracted out someplace.
# They mark collection, document and sentence boundaries in the clauses
# stream fed to the suffix program.
my ($colstart, $colend, $doclimit, $senlimit) =
    ("COLLECTIONSTART", "COLLECTIONEND", "DOCUMENTLIMIT", "SENTENCELIMIT");
my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
149
150
151# Create a new Phind browser based on collect.cfg
152
# Create a new Phind browser based on collect.cfg.
#
# $classifierslist  - list of classifier package names (this class is appended)
# $inputargs        - raw option strings from the collection configuration
# $hashArgOptLists  - accumulator for argument/option descriptions
#
# Returns the blessed classifier object, or exits if the Phind "suffix"
# helper program is not installed.
sub new {
    my ($class) = shift (@_);
    my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
    push(@$classifierslist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Direct method call rather than indirect-object "new BaseClassifier" syntax.
    my $self = BaseClassifier->new($classifierslist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
	# don't worry about any options etc
	return bless $self, $class;
    }

    # Ensure the Phind "suffix" phrase-extraction program is in place.
    my $file1 = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
    # Match GSDLOS case-insensitively, consistent with the check in
    # get_classify_info().
    $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
    if (!(-e $file1)) {
	print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
	exit(1);
    }

    # things that may have ex. in them that need to be stripped off
    $self->{'text'} = $self->strip_ex_from_metadata($self->{'text'});
    $self->{'title'} = $self->strip_ex_from_metadata($self->{'title'});

    # Transfer value from Auto Parsing to the variable name that used in previous GreenStone.
    $self->{"indexes"} = $self->{"text"};

    # Further setup
    $self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
    $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
    if (! defined $self->{'builddir'}) {
	$self->{'builddir'} = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "building");
    }
    $self->{'total'} = 0;

    # we set phind to be rtl if language is arabic
    if ($self->{'language'} eq "ar") {
	$self->{'textorientation'} = "rtl";
    }

    # Clean out the unused keys
    delete $self->{"text"};

    return bless $self, $class;
}
202
203
204# Initialise the Phind classifier
205
# Initialise the Phind classifier: create a fresh phindN directory under
# the build directory and open the "clauses" (clause text) and "docs.txt"
# (document details) files that classify() will append to.
#
# Side effects: sets $self->{'phinddir'}, {'phindnumber'}, {'txthandle'},
# {'dochandle'}; registers the directory for cleanup at exit unless the
# -untidy flag was given.
sub init {
    my $self = shift (@_);

    # ensure we have a build directory
    my $builddir = $self->{'builddir'};
    die unless (-e "$builddir");

    # create Phind directory: phind1, phind2, ... first unused number
    my $phnumber = 1;
    my $phinddir = &FileUtils::filenameConcatenate($builddir, "phind1");
    while (-e "$phinddir") {
	$phnumber++;
	$phinddir = &FileUtils::filenameConcatenate($builddir, "phind$phnumber");
    }
    &util::mk_dir("$phinddir");
    $self->{'phinddir'} = $phinddir;
    $self->{'phindnumber'} = $phnumber;

    push(@removedirs, $phinddir) unless $self->{'untidy'};

    # open filehandles for documents and text
    my $clausefile = &FileUtils::filenameConcatenate("$phinddir", "clauses");
    &FileUtils::removeFiles($clausefile) if (-e $clausefile);

    # Handle names are strings (e.g. TEXT1) so that END and
    # get_classify_info() can refer to them via $self; three-argument
    # open keeps the mode separate from the filename.
    my $txthandle = 'TEXT' . $phnumber;
    open($txthandle, '>', $clausefile) || die "Cannot open $clausefile: $!";
    $self->{'txthandle'} = $txthandle;

    my $docfile = &FileUtils::filenameConcatenate("$phinddir", "docs.txt");
    &FileUtils::removeFiles($docfile) if (-e $docfile);

    my $dochandle = 'DOC' . $phnumber;
    open($dochandle, '>', $docfile) || die "Cannot open $docfile: $!";
    $self->{'dochandle'} = $dochandle;

}
242
243
244# Classify each document.
245#
246# Each document is passed here in turn. The classifier extracts the
247# text of each and stores it in the clauses file. Document details are
248# stored in the docs.txt file.
249
# Classify each document.
#
# Each document is passed here in turn.  The classifier extracts the
# text of each and appends it to the clauses file; document details
# (OID and title) are appended to the docs.txt file.
sub classify {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $verbosity = $self->{'verbosity'};
    my $top_section = $doc_obj->get_top_section();

    # Look up the document's title for the docs.txt record.
    my $title = $doc_obj->get_metadata_element($top_section, $self->{'title'});
    if (!defined($title)) {
	$title = "";
	print STDERR "Phind: document has no title\n";
    }
    print "process: $title\n" if ($verbosity > 2);

    # Only consider the file if it is in the correct language
    my $phrlanguage = $self->{'language'};
    my $doclanguage = $doc_obj->get_metadata_element($top_section, "Language");
    return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));

    # record this file
    $self->{'total'} ++;

    # Store document details
    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;
    my $dochandle = $self->{'dochandle'};
    print $dochandle "<Document>\t$OID\t$title\n";

    # Store the text occurring in this object: begin with the
    # document delimiter symbol.
    my $txthandle = $self->{'txthandle'};
    print $txthandle "$doclimit\n";

    # Gather the text of every requested index into $text.
    my $text = "";

    foreach my $part (split(/,/, $self->{'indexes'})) {

	# Each field has a level and a data element (e.g. document:Title)
	my ($level, $field) = split(/:/, $part);
	die unless ($level && $field);

	if ($field eq "text") {
	    # Extract the text from every section
	    # (In phind, document:text and section:text are equivalent)
	    my $data = "";
	    my $section = $doc_obj->get_top_section();
	    while (defined($section)) {
		$data .= $doc_obj->get_text($section) . "\n";
		$section = $doc_obj->get_next_section($section);
	    }
	    $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	}
	elsif ($level eq "document") {
	    # Extract a metadata field from a document
	    # (If there is more than one element of the given type, get them all.)
	    my $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
	    foreach my $data (@$dataref) {
		$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	    }
	}
	elsif ($level eq "section") {
	    # Extract metadata from every section in a document
	    my $data = "";
	    my $section = $doc_obj->get_top_section();
	    while (defined($section)) {
		my $dataref = $doc_obj->get_metadata($section, $field);
		$data .= join("\n", @$dataref) . "\n";
		$section = $doc_obj->get_next_section($section);
	    }
	    $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	}
	else {
	    # Some sort of specification which I don't understand
	    die "Unknown level ($level) in Phind index ($part)\n";
	}

    }

    # output the text, squashing runs of newlines
    $text =~ tr/\n//s;
    print $txthandle "$text";
}
343
344
345# Construct the classifier from the information already gathered
346#
347# When get_classify_info is called, the clauses and docs.txt files have
348# already been constructed in the Phind directory. This function will
349# translate them into compressed, indexed MGPP files that can be read by
350# the phindcgi script. It will also register our classifier so that it
351# shows up in the navigation bar.
352
# Construct the classifier from the information already gathered.
#
# When get_classify_info is called, the clauses and docs.txt files have
# already been constructed in the Phind directory. This function will
# translate them into compressed, indexed MGPP files that can be read by
# the phindcgi script. It will also register our classifier so that it
# shows up in the navigation bar.
#
# $gli - true when invoked from the GLI, which parses the <Stage>/<Phase>
#        progress markup printed to STDERR.
#
# Returns a classify-info hashref (or nothing if no phrases were found).
# Note: every external tool is run via execute(), which exits the whole
# process on failure.
sub get_classify_info {
    my $self = shift (@_);
    my ($gli) = @_;

    # The clauses/docs.txt files are complete; release their handles.
    close $self->{'dochandle'};
    close $self->{'txthandle'};
    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};
    my $phinddir = $self->{'phinddir'};

    # Extra flag passed to the mgpp tools on non-windows systems.
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
	$osextra = " -d /";
    }

    # Close the enclosing GLI progress stage (presumably opened by the
    # build driver) before opening our own -- TODO confirm.
    print STDERR "</Stage>\n" if $gli;

    if ($verbosity) {
	print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
	print $out "*** in ", $self->{'phinddir'}, "\n";
    }

    print STDERR "<Stage name='Phind'>\n" if $gli;

    # Construct phind indexes
    my $suffixmode = $self->{'suffixmode'};
    my $min_occurs = $self->{'min_occurs'};
    my ($command, $status);

    # Generate the vocabulary, symbol statistics, and numbers file
    # from the clauses file
    print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
    print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
    &extract_vocabulary($self);

    # Use the suffix program to generate the phind/phrases file
    print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
    print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
    &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);

    # check that we generated some files. It's not necessarily an error if
    # we didn't (execute() would have quit on error), but we can't go on.
    my $phrasesfile=&FileUtils::filenameConcatenate($self->{'phinddir'}, 'phrases');
    if (! -r $phrasesfile) {
	print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
	print $out "\nNo phrases found for Phind classifier!\n";
	return;
    }

    # Create the phrase file and put phrase numbers in phind/phrases
    print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
    print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
    &renumber_phrases($self);

    # Build the compressed phrase-data (pdata) database.  Two mgpp passes:
    # -T1 gathers statistics for the compression model, -T2 compresses.
    print $out "\nCreating phrase databases\n";
    print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
    my $mg_input = &FileUtils::filenameConcatenate($phinddir, "pdata.txt");
    my $mg_stem = &FileUtils::filenameConcatenate($phinddir, "pdata");

    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);

    # create the mg index of words (pword): text passes plus inverted-file
    # passes (-I1/-I2), weights, dictionary and stem indexes.
    print $out "\nCreating word-level search indexes\n";
    print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
    $mg_input = &FileUtils::filenameConcatenate($phinddir, "pword.txt");
    $mg_stem = &FileUtils::filenameConcatenate($phinddir, "pword");

    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);

    # Build the three stem indexes (-s 1/2/3; exact meaning of each level
    # is defined by mgpp_stem_idx -- TODO confirm against mgpp docs).
    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);

    # create the mg document information database
    print $out "\nCreating document information databases\n";
    print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
    $mg_input = &FileUtils::filenameConcatenate($phinddir, "docs.txt");
    $mg_stem = &FileUtils::filenameConcatenate($phinddir, "docs");

    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);

    # Parameters handed through to the phindcgi script.
    my $parameters = "phindnumber=$self->{'phindnumber'}";
    if (defined ($self->{'textorientation'})) {
	$parameters .= ";textorientation=$self->{'textorientation'}";
    }
    # Return the information about the classifier that we'll later want to
    # use to create macros when the Phind classifier document is displayed.
    my %classifyinfo = ('thistype'=>'Invisible',
			'childtype'=>'Phind',
			'Title'=>$self->{'buttonname'},
			'parameters'=>$parameters,
			'contains'=>[]);

    my $collection = $self->{'collection'};
    my $url = "library?a=p&p=phind&c=$collection";
    push (@{$classifyinfo{'contains'}}, {'OID'=>$url});

    return \%classifyinfo;
}
461
462
463
# Convert marked-up (GML/HTML-ish) text to Phind "clause format": markup
# stripped, one clause per line, delimiter symbols protected.
#
# $language_exp - the classifier's language expression ("en", "zh", ...)
# $text         - the raw text
#
# Returns the tokenised text.  Delegates to the English- and Chinese-
# specific versions where appropriate; otherwise applies a generic
# language-neutral tokenisation.
sub convert_gml_to_tokens {

    my ($language_exp, $text) = @_;

    # escape any magic words... - jrm21
    # (occurrences of delimiter symbols inside the text are lower-cased so
    # they cannot be mistaken for real delimiters later on)
    foreach my $delim (@delimiters) {
	my $replacement = lc($delim);
	$text =~ s/$delim/$replacement/g;
    }

    if ($language_exp =~ /^en$/) {
	return &convert_gml_to_tokens_EN($text);
    }

    if ($language_exp =~ /zh/) {
	return &convert_gml_to_tokens_ZH($text);
    }

    # Generic path.  Work on a localized alias of $text via foreach's
    # topicalizer instead of assigning to the global $_, which would
    # clobber the caller's $_.
    for ($text) {

	# 1. remove GML tags

	# Remove everything that is in a tag
	s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
	s/\s*<br>\s*/ LINEBREAK /isgo;
	s/<[^>]*>/ /sgo;

	# Now we have the text, but it may contain HTML
	# elements coded as &gt; etc. Remove these tags.
	s/&amp;/&/sgo;
	s/&lt;/</sgo;
	s/&gt;/>/sgo;
	s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
	s/\s*<br>\s*/ LINEBREAK /isgo;
	s/<[^>]*>/ /sgo;

	# replace <p> and <br> placeholders with clause break symbol (\n)
	s/\s+/ /gso;
	s/PARAGRAPHBREAK/\n/sgo;
	s/LINEBREAK/\n/sgo;


	# 2. Split the remaining text into space-delimited tokens

	# Convert any HTML special characters (like &quot;) to their UTF8 equivalent
	s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;

	# Split text at word boundaries
	s/\b/ /go;

	# 3. Convert the remaining text to "clause format"

	# Insert newline if the end of a sentence is detected
	# (delimter is: "[\.\?\!]\s")
	# s/\s*[\.\?\!]\s+/\n/go;

	# remove unnecessary punctuation and replace with clause break symbol (\n)
	# the following very nicely removes all non alphanumeric characters. too bad if you are not using english...
	#s/[^\w ]/\n/go;
	# replace punct with new lines - is this what we want??
	s/\s*[\?\;\:\!\,\.\"\[\]\{\}\(\)]\s*/\n/go; #"
	# then remove other punct with space
	s/[\'\`\\\_]/ /go;

	# remove extraneous whitespace
	s/ +/ /sgo;
	s/^\s+//mgo;
	s/\s*$/\n/mgo;

	# remove lines that contain one word or less
	s/^\S*$//mgo;
	s/^\s*$//mgo;
	tr/\n//s;
    }

    return $text;
}
541
542# a chinese version
# A chinese version of convert_gml_to_tokens.
#
# Takes the raw text as its single argument and returns it with markup
# removed, basic entities decoded, and <p>/<br> converted to clause
# breaks (newlines).  Operates on a lexical copy via a localized alias,
# so - unlike the previous "$_ = shift" version - the caller's global $_
# is not clobbered.
sub convert_gml_to_tokens_ZH {

    my $text = shift @_;

    for ($text) {
	# Replace all whitespace with a simple space
	s/\s+/ /gs;
	# Remove everything that is in a tag
	s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
	s/\s*<br>\s*/ LINEBREAK /isg;
	s/<[^>]*>/ /sg;

	# Now we have the text, but it may contain HTML
	# elements coded as &gt; etc. Remove these tags.
	s/&lt;/</sg;
	s/&gt;/>/sg;

	s/\s+/ /sg;
	s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
	s/\s*<br>\s*/ LINEBREAK /isg;
	s/<[^>]*>/ /sg;

	# remove &amp; and other miscellaneous markup tags
	s/&amp;/&/sg;
	s/&lt;/</sg;
	s/&gt;/>/sg;
	# NOTE(review): second &amp; pass kept from the original; it also
	# decodes double-encoded "&amp;amp;" input.
	s/&amp;/&/sg;

	# replace <p> and <br> placeholders with carriage returns
	s/PARAGRAPHBREAK/\n/sg;
	s/LINEBREAK/\n/sg;
    }

#    print STDERR "text:$text\n";
    return $text;
}
578
579# A version of convert_gml_to_tokens that is fine-tuned to the English language.
580
# A version of convert_gml_to_tokens that is fine-tuned to the English
# language.  Takes the raw text as its single argument and returns it in
# clause format (one clause per line, punctuation normalised away).
# Operates on a lexical copy via a localized alias, so - unlike the
# previous "$_ = shift" version - the caller's global $_ is not clobbered.
sub convert_gml_to_tokens_EN {

    my $text = shift @_;

    for ($text) {
	# FIRST, remove GML tags

	# Replace all whitespace with a simple space
	s/\s+/ /gs;

	# Remove everything that is in a tag
	s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
	s/\s*<br>\s*/ LINEBREAK /isg;
	s/<[^>]*>/ /sg;

	# Now we have the text, but it may contain HTML
	# elements coded as &gt; etc. Remove these tags.
	s/&lt;/</sg;
	s/&gt;/>/sg;

	s/\s+/ /sg;
	s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
	s/\s*<br>\s*/ LINEBREAK /isg;
	s/<[^>]*>/ /sg;

	# remove &amp; and other miscellaneous markup tags
	s/&amp;/&/sg;
	s/&lt;/</sg;
	s/&gt;/>/sg;
	s/&amp;/&/sg;

	# replace <p> and <br> placeholders with carriage returns
	s/PARAGRAPHBREAK/\n/sg;
	s/LINEBREAK/\n/sg;


	# Exceptional punctuation
	#
	# We make special cases of some punctuation

	# remove any apostrophe that indicates omitted letters
	s/(\w+)\'(\w*\s)/ $1$2 /g;

	# remove period that appears in a person's initals
	s/\s([A-Z])\./ $1 /g;

	# replace hyphens in hyphenated words and names with a space
	s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;

	# Convert the remaining text to "clause format",
	# This means removing all excess punctuation and garbage text,
	# normalising valid punctuation to fullstops and commas,
	# then putting one clause on each line.

	# Insert newline when the end of a sentence is detected
	# (delimter is: "[\.\?\!]\s")
	s/\s*[\.\?\!]\s+/\n/g;

	# split numbers after four digits
	s/(\d\d\d\d)/$1 /g;

	# split words after 32 characters

	# squash repeated punctuation
	tr/A-Za-z0-9 //cs;

	# save email addresses
	# s/\w+@\w+\.[\w\.]+/EMAIL/g;

	# normalise clause breaks (mostly punctuation symbols) to commas
	s/[^A-Za-z0-9 \n]+/ , /g;

	# Remove repeated commas, and replace with newline
	s/\s*,[, ]+/\n/g;

	# remove extra whitespace
	s/ +/ /sg;
	s/^\s+//mg;
	s/\s*$/\n/mg;

	# remove lines that contain one word or less
	s/^\w*$//mg;
	s/^\s*$//mg;
	tr/\n//s;
    }

    return $text;
}
667
668
669
670# Execute a system command
671
# Execute a system command, echoing it to $outhandle when verbosity > 2.
# On failure the whole process exits so that the build stops.
sub execute {
    my ($command, $verbosity, $outhandle) = @_;
    print $outhandle "Executing: $command\n" if ($verbosity > 2);
    $! = 0;
    my $status = system($command);
    if ($status != 0) {
	print STDERR "Phind - Error executing '$command': $!\n";
	# system() returns the 16-bit wait status; the child's exit code
	# is in the high byte.  Passing the raw status to exit() would be
	# truncated to 8 bits (e.g. 256 -> 0, falsely reporting success),
	# so reduce it to a guaranteed non-zero 8-bit code.
	# This causes the build to fail...
	exit(($status >> 8) || 1);
    }
}
682
683
684# Generate the vocabulary, symbol statistics, and numbers file from the
685# clauses file. This is legacy code, so is a bit messy and probably wont
686# run under windows.
687
# Generate the vocabulary, symbol statistics, and numbers file from the
# clauses file. This is legacy code, so is a bit messy and probably wont
# run under windows.
#
# Reads:   $phinddir/clauses, the stopword files under
#          $GSDLHOME/etc/packages/phind/stopword, and (optionally) the
#          thesaurus .lnk/.<lang> files in the collection's etc directory.
# Writes:  $phinddir/clauses.vocab   - one best-form word per symbol number
#          $phinddir/clauses.stats   - symbol-number ranges per word class
#          $phinddir/clauses.numbers - the clause text as symbol numbers
#          $phinddir/<thesaurus>.numbers - thesaurus links (if configured)
sub extract_vocabulary {
    my ($self) = @_;

    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};

    my $collectiondir = $self->{'collectiondir'};
    my $phinddir = $self->{'phinddir'};

    my $language_exp = $self->{'language'};

    my ($w, $l, $line, $word);

    # Symbol-number range markers, written to clauses.stats at the end.
    my ($first_delimiter, $last_delimiter,
	$first_stopword, $last_stopword,
	$first_extractword, $last_extractword,
	$first_contentword, $last_contentword,
	$phrasedelimiter);

    my $thesaurus = $self->{'thesaurus'};
    my ($thesaurus_links, $thesaurus_terms,
	%thesaurus, $first_thesaurusword, $last_thesaurusword);

    # %symbol maps token -> symbol number; %freq counts raw occurrences.
    my %symbol;
    my (%freq);

    print $out "Calculating vocabulary\n" if ($verbosity > 1);

    # Read and store the stopwords
    my $stopdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
    # NOTE(review): $stopword_files is assigned undef here and never used.
    my $stopword_files = ();
    my ($language, $language_dir, $file, $file_name);
    # %stopwords maps lower-cased word -> original-case word.
    my %stopwords;

    # Examine each directory in the stopword directory
    opendir(STOPDIR, $stopdir);
    foreach $language (readdir STOPDIR) {

	# Ignore entries that do not match the classifier's language
	next unless ($language =~ /$language_exp/);
	$language_dir = &FileUtils::filenameConcatenate($stopdir, $language);
	next unless (-d "$language_dir");

	opendir(LANGDIR, $language_dir);
	foreach $file (readdir LANGDIR) {

	    # Ignore entries that are not stopword files
	    next unless ($file =~ /sw$/);
	    $file_name = &FileUtils::filenameConcatenate($language_dir, $file);
	    next unless (-f "$file_name");

	    # Read the stopwords (first whitespace-delimited word per line)
	    open(STOPFILE, "<$file_name");
	    while (<STOPFILE>) {
		s/^\s+//;
		s/\s.*//;
		$word = $_;
		$l = lc($word);
		$stopwords{$l} = $word;
	    }
	    close STOPFILE;

	}
	closedir LANGDIR;
    }
    closedir STOPDIR;

    # Read thesaurus information
    if ($thesaurus) {

	# link file exists
	$thesaurus_links = &FileUtils::filenameConcatenate($collectiondir, "etc", "$thesaurus.lnk");
	die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");

	# ensure term file exists in the correct language
	if ($language_exp =~ /^([a-z][a-z])/) {
	    $language = $1;
	} else {
	    $language = 'en';
	}
	$thesaurus_terms = &FileUtils::filenameConcatenate($collectiondir, "etc", "$thesaurus.$language");
	die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");


	# Read the thesaurus terms: strip the leading id and any
	# parenthesised qualifier, then store each word by lower case.
	open(TH, "<$thesaurus_terms");
	while(<TH>) {
	    s/^\d+ //;
	    s/\(.*\)//;
	    foreach $w (split(/\s+/, $_)) {
		$thesaurus{lc($w)} = $w;
	    }
	}
	close TH;
    }

    # Read words in the text and count occurrences
    open(TXT, "<$phinddir/clauses");

    my @words;
    while(<TXT>) {
	$line = $_;
	next unless ($line =~ /./);

	@words = split(/\s+/, $line);
	foreach $w (@words) {
	    $l = lc($w);
	    # Count stopwords/thesaurus terms under their lower-cased form.
	    $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
	    $freq{$w}++;
	}
	# every clause line implies one sentence delimiter
	$freq{$senlimit}++;
    }

    close TXT;

    # Calculate the "best" form of each word: stopword/thesaurus case if
    # known, otherwise the most frequent surface form seen in the text.
    my (%bestform, %totalfreq, %bestfreq);

    foreach $w (sort (keys %freq)) {
	$l = lc($w);

	# totalfreq is the number of times a term appears in any form
	$totalfreq{$l} += $freq{$w};

	if (defined $stopwords{$l}) {
	    $bestform{$l} = $stopwords{$l};

	} elsif (defined $thesaurus{$l}) {
	    $bestform{$l} = $thesaurus{$l};

	} elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
	    $bestfreq{$l} = $freq{$w};
	    $bestform{$l} = $w;
	}
    }
    undef %freq;
    undef %bestfreq;


    # Assign symbol numbers to tokens, in class order:
    # delimiters, stopwords, thesaurus terms, then other content words.
    my $nextsymbol = 1;
    my (@vocab);

    # Delimiters
    $first_delimiter = 1;

    foreach $word (@delimiters) {

#	$word = lc($word); # jrm21
	$word = uc($word);
	$bestform{$word} = $word;
	$vocab[$nextsymbol] = $word;
	$symbol{$word} = $nextsymbol;
	$nextsymbol++;
    }
    $last_delimiter = $nextsymbol - 1;
    # Stopwords
    $first_stopword = $nextsymbol;

    foreach my $word (sort keys %stopwords) {
	# don't include stopword unless it occurs in the text
	$word = lc($word);
	next unless ($totalfreq{$word});
	next if ($symbol{$word});

	$vocab[$nextsymbol] = $word;
	$symbol{$word} = $nextsymbol;
	$nextsymbol++;
    }
    $last_stopword = $nextsymbol - 1;
    $first_contentword = $nextsymbol;

    # Thesaurus terms
    if ($thesaurus) {
	$first_thesaurusword = $nextsymbol;

	foreach my $word (sort keys %thesaurus) {

	    $word = lc($word);
	    next if ($symbol{$word});
	    $bestform{$word} = $thesaurus{$word};

	    $vocab[$nextsymbol] = $word;
	    $symbol{$word} = $nextsymbol;
	    $nextsymbol++;

	}
	$last_thesaurusword = $nextsymbol - 1;
    }

    # Other content words
    $first_extractword = $nextsymbol;

    foreach my $word (sort (keys %bestform)) {

	next if ($symbol{$word});

	$vocab[$nextsymbol] = $word;
	$symbol{$word} = $nextsymbol;
	$nextsymbol++;
    }
    $last_extractword = $nextsymbol - 1;
    $last_contentword = $nextsymbol - 1;

    # Output the words: clauses.vocab line N is the best form of symbol N.
    print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
    open(VOC, ">$phinddir/clauses.vocab");

    for (my $i = 1; $i < $nextsymbol; $i++) {
	$w = $vocab[$i];

	print VOC "$bestform{$w}\n";
	$totalfreq{$w} = 0 unless ($totalfreq{$w});
    }
    close VOC;


    # Create statistics file
    # Output statistics about the vocabulary
    print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
    &FileUtils::removeFiles("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");

    open(STAT, ">$phinddir/clauses.stats")
	|| die "Cannot open $phinddir/clauses.stats: $!";

    print STAT "first_delimiter $first_delimiter\n";
    print STAT "last_delimiter $last_delimiter\n";
    print STAT "first_stopword $first_stopword\n";
    print STAT "last_stopword $last_stopword\n";
    if ($thesaurus) {
	print STAT "first_thesaurusword $first_thesaurusword\n";
	print STAT "last_thesaurusword $last_thesaurusword\n";
    }
    print STAT "first_extractword $first_extractword\n";
    print STAT "last_extractword $last_extractword\n";
    print STAT "first_contentword $first_contentword\n";
    print STAT "last_contentword $last_contentword\n";
    print STAT "first_symbol $first_delimiter\n";
    print STAT "last_symbol $last_contentword\n";
    print STAT "first_word $first_stopword\n";
    print STAT "last_word $last_contentword\n";
    close STAT;

    undef @vocab;


    # Create numbers file
    # Save text as symbol numbers
    print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);

    open(TXT, "<$phinddir/clauses");
    open(NUM, ">$phinddir/clauses.numbers");

##    $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
##    print NUM "$symbol{lc($colstart)}\n"; # jrm21
    $phrasedelimiter = $symbol{$senlimit};
    print NUM "$symbol{$colstart}\n";

    # set up the special symbols that delimit documents and sentences
    while(<TXT>) {

	# split sentence into a list of tokens
	$line = $_;
	next unless ($line =~ /./);
	@words = split(/\s+/, $line);

	# output one token at a time
	foreach $word (@words) {
# don't lower-case special delimiters - jrm21
	    if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
		$word = lc($word);
	    }
	    print NUM "$symbol{$word}\n";
	}

	# output phrase delimiter
	print NUM "$phrasedelimiter\n";
    }

    close TXT;
#    print NUM "$symbol{lc($colend)}\n";# jrm21
    print NUM "$symbol{$colend}\n";
    close NUM;

    # Save thesaurus data in one convenient file
    if ($thesaurus) {

	my $thesaurusfile = &FileUtils::filenameConcatenate($phinddir, "$thesaurus.numbers");


	print $out "Saving thesaurus as numbers in $thesaurusfile\n"
	    if ($verbosity > 1);

	# Read the thesaurus terms
	my ($num, $text, %thes_symbols);

	open(TH, "<$thesaurus_terms");
	while(<TH>) {
	    chomp;
	    @words = split(/\s+/, $_);
	    $num = shift @words;
	    $text = "";

	    # translate words into symbol numbers
	    foreach $word (@words) {
		$word = lc($word);
		if ($symbol{$word}) {
		    $text .= "s$symbol{$word} ";
		} elsif ($verbosity) {
		    print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
		}
	    }
	    $text =~ s/ $//;
	    $thes_symbols{$num} = $text;
	}
	close TH;

	# Read the thesaurus links and write the corresponding data
	open(TH, "<$thesaurus_links");
	open(THOUT, ">$thesaurusfile");

	while(<TH>) {
	    chomp;
	    ($num, $text) = split(/:/, $_);

	    if (defined($thes_symbols{$num})) {
		print THOUT "$num:$thes_symbols{$num}:$text\n";
	    } else {
		print THOUT "$num:untranslated:$text\n";
	    }
	}
	close TH;
	close THOUT;
    }




}
1027
1028
1029# renumber_phrases
1030#
1031# Prepare the phrases file to be input to mgpp. The biggest problem is
1032# reconciling the phrase identifiers used by the suffix program (which
1033# we'll call suffix-id numbers) with the numbers used in the thesaurus
# (thesaurus-id) to create a common set of phind id numbers (phind-id).
# Phind-id numbers must be sorted by frequency of occurrence.
#
# Start creating a set of phind-id numbers from the sorted suffix-id
# numbers and (if required) the thesaurus-id numbers. Then add any other
# phrases occurring in the thesaurus.
1040#
1041# The last thing we have to do is restore the vocabulary information to the
1042# phrase file so that the phrases are stored as words, not as symbol
1043# numbers.
1044
1045# The original phrases file looks something like this:
1046# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1047# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1048# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1049# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1050
1051
sub renumber_phrases {
    my ($self) = @_;

    # Run the three translation passes in order:
    #   phrases   -> phrases.2  (suffix-ids become phind-ids)
    #   phrases.2 -> phrases.3  (thesaurus data merged in, if any)
    #   phrases.3 -> pdata.txt / pword.txt (vocabulary restored)
    foreach my $pass (\&renumber_suffix_data,
		      \&renumber_thesaurus_data,
		      \&restore_vocabulary_data) {
	$pass->($self);
    }
}
1060
1061
1062
1063# renumber_suffix_data
1064#
1065# Translate phrases file to phrases.2 using phind keys instead
1066# of suffix keys and sorting the expansion data.
1067
sub renumber_suffix_data {
    my ($self) = @_;

    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};
    print $out "Translate phrases: suffix-ids become phind-id's\n"
	if ($verbosity);

    my $phinddir = $self->{'phinddir'};
    my $infile = &FileUtils::filenameConcatenate($phinddir, 'phrases');
    my $outfile = &FileUtils::filenameConcatenate($phinddir, 'phrases.2');

    # Pass 1: read the phrase file.  Allocate phind-id numbers to
    # suffix-ids in file order, and remember each phrase's total
    # frequency so expansion lists can be frequency-sorted in pass 2.

    my %suffixtophind;
    my @totalfrequency;
    my (@fields, $suffixid);
    my $nextphind = 1;

    open(IN, "<$infile")
	|| die "Cannot open $infile: $!";
    while(<IN>) {

	chomp;
	@fields = split(/:/, $_);

	# first field is the suffix-id; assign it the next phind-id
	$suffixid = shift @fields;
	$suffixtophind{$suffixid} = $nextphind;

	# skip the symbol text, then store the total frequency
	shift @fields;
	$totalfrequency[$nextphind] = shift @fields;

	$nextphind++;
    }
    close IN;


    # Pass 2: translate phrases file to phrases.2.  Use phind keys (not
    # suffix keys), sort expansion and document occurrence data in order
    # of descending frequency.
    open(IN, "<$infile")
	|| die "Cannot open $infile: $!";
    open(OUT, ">$outfile")
	|| die "Cannot open $outfile: $!";

    my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
    my (@newexp, $k, $n);
    my $linenumber = 0;

    while(<IN>) {

	# read the line
	chomp;
	@fields = split(/:/, $_);

	# get a phrase number for this line
	$suffixid = shift @fields;
	die "ERROR - no phindid for suffix-id: $suffixid"
	    unless (defined($suffixtophind{$suffixid}));
	$phindid = $suffixtophind{$suffixid};

	# get the symbols in the phrase
	$text = shift @fields;

	# output status information
	$linenumber++;
	if ($verbosity > 2) {
	    if ($linenumber % 1000 == 0) {
		print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
	    }
	}

	# get the phrase frequency
	$tf = shift @fields;

	# get the number of expansions
	$countexp = shift @fields;

	# get the expansions, convert them into phind-id numbers, and
	# sort them by descending total frequency
	$expansions = shift @fields;
	@newexp = ();
	foreach $k (split(/,/, $expansions)) {
	    die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
	    $n = $suffixtophind{$k};
	    push @newexp, $n;
	}
	@newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;

	# get the number of documents
	$countdocs = shift @fields;

	# strip the "d" prefixes from the document entries and sort them
	# by descending within-document frequency (see by_doc_frequency)
	$documents = shift @fields;
	$documents =~ s/d//g;
	my @documents = split(/;/, $documents);
	@documents = sort by_doc_frequency @documents;

	# output the phrase data
	print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
	print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";

    }

    close IN;
    close OUT;
}
1175
1176
1177# renumber_thesaurus_data
1178#
1179# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1180
sub renumber_thesaurus_data {
    my ($self) = @_;

    # Merge thesaurus information into the phrases file: translate
    # phrases.2 to phrases.3, replacing thesaurus-id numbers with
    # phind-id numbers and appending thesaurus link data to each phrase.

    my $out = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    my $thesaurus = $self->{'thesaurus'};

    my $phinddir = $self->{'phinddir'};
    my $infile = &FileUtils::filenameConcatenate($phinddir, "phrases.2");
    my $outfile = &FileUtils::filenameConcatenate($phinddir, "phrases.3");


    # If no thesaurus is defined, simply move the phrases file.
    if (!$thesaurus) {
	print $out "Translate phrases.2: no thesaurus data\n"
	    if ($verbosity);
	&FileUtils::moveFiles($infile, $outfile);
	return;
    }

    print $out "Translate phrases.2: add thesaurus data\n"
	if ($verbosity);

    # 1.
    # Read thesaurus file and store (symbols->thesaurusid) mapping
    # NOTE(review): the opens in this sub are unchecked; a missing
    # $thesaurus.numbers file would silently produce empty output.
    my $thesaurusfile = &FileUtils::filenameConcatenate($phinddir, "$thesaurus.numbers");
    my %symbolstothesid;
    my (@fields, $thesid, $symbols);

    open(TH, "<$thesaurusfile");

    while (<TH>) {

	chomp;
	@fields = split(/:/, $_);

	# get id and text
	$thesid = shift @fields;
	$symbols = shift @fields;
	$symbolstothesid{$symbols} = $thesid;
    }
    close TH;

    # 2.
    # Read phrases file to find thesaurus entries that already
    # have a phindid. Store their phind-ids for later translation,
    # and store their frequency for later sorting.
    my %thesaurustophindid;
    my %phindidtofrequency;
    my ($phindid, $freq);

    open(IN, "<$infile");

    while(<IN>) {

	chomp;
	@fields = split(/:/, $_);

	# phindid and symbols for this line
	$phindid = shift @fields;
	$symbols = shift @fields;
	$freq = shift @fields;

	# do we have a thesaurus id corresponding to this phrase?
	if (defined($symbolstothesid{$symbols})) {
	    $thesid = $symbolstothesid{$symbols};
	    $thesaurustophindid{$thesid} = $phindid;
	    $phindidtofrequency{$phindid} = $freq;
	}
    }
    close IN;

    # free the symbol mapping; it is no longer needed
    undef %symbolstothesid;

    # 3.
    # Create phind-id numbers for remaining thesaurus entries,
    # and note that their frequency is 0 for later sorting.
    # NOTE(review): $phindid still holds the id of the LAST line read
    # from phrases.2 above — this assumes phind-ids are assigned in
    # ascending order so $phindid + 1 is the first free id, and that
    # phrases.2 is non-empty; confirm against renumber_suffix_data.
    my $nextphindid = $phindid + 1;

    open(TH, "<$thesaurusfile");
    while(<TH>) {

	chomp;
	@fields = split(/:/, $_);

	# read thesaurus-id and ensure it has a corresponding phind-id
	$thesid = shift @fields;
	if (!defined($thesaurustophindid{$thesid})) {
	    $thesaurustophindid{$thesid} = $nextphindid;
	    $phindidtofrequency{$nextphindid} = 0;
	    $nextphindid++;
	}
    }
    close TH;

    # 4.
    # Translate thesaurus file, replacing thesaurus-id numbers with
    # phind-id numbers.
    my $newthesaurusfile = &FileUtils::filenameConcatenate($phinddir, "$thesaurus.phindid");
    my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
    my (@links, $linkid, %linkidtotype, $newrelation);

    open(TH, "<$thesaurusfile");
    open(TO, ">$newthesaurusfile");
    while(<TH>) {

	chomp;
	@fields = split(/:/, $_);

	# phindid and symbols for this line
	($thesid, $symbols, $relations) = @fields;

	die unless ($thesid && $symbols);
	die unless $thesaurustophindid{$thesid};
	$phindid = $thesaurustophindid{$thesid};

	# convert each part of the relation string to use phind-id numbers
	# at the same time, we want to sort the list by frequency.
	undef %linkidtotype;

	# $relations looks like "BT,id,id;NT,id;..." — each ;-separated
	# chunk is a link type followed by the thesaurus-ids it links to
	foreach $linktext (split(/;/, $relations)) {
	    @linkdata = split(/,/, $linktext);

	    # remember the linktype (e.g. BT, NT)
	    $linktype = shift @linkdata;

	    # store the type of each link
	    foreach $thesid (@linkdata) {
		die unless (defined($thesaurustophindid{$thesid}));
		$linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
	    }
	}

	# sort the list of links, first by frequency, then by type.
	@links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
			    or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
	$linkcounter = (scalar @links);

	# create a string describing the link information
	# NOTE(review): if a thesaurus entry has no relations, @links is
	# empty and $links[0] is undef here — verify inputs always carry
	# at least one relation.
	$linktype = $linkidtotype{$links[0]};
	$newrelation = $linktype;
	foreach $linkid (@links) {
	    # start a new ;-separated group whenever the link type changes
	    if ($linkidtotype{$linkid} ne $linktype) {
		$linktype = $linkidtotype{$linkid};
		$newrelation .= ";" . $linktype;
	    }
	    $newrelation .= "," . $linkid;
	}
	$newrelation .= ";";


	# output the new line
	print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
    }
    close TH;
    close TO;

    # free working maps no longer needed below
    undef %thesaurustophindid;
    undef %linkidtotype;
    undef %phindidtofrequency;

    # 5.
    # Read thesaurus data (in phind-id format) into memory
    my %thesaurusdata;

    open(TH, "<$newthesaurusfile");
    while(<TH>) {
	chomp;
	($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
	die unless ($phindid && $symbols);
	$thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
    }
    close TH;

    # 6.
    # Add thesaurus data to phrases file
    # NOTE(review): @documwents (typo) and @newexp/$k/$n are declared but
    # never used in this phase — apparently copied from
    # renumber_suffix_data.
    my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
    my (@documwents, @newexp, $k, $n);
    my $linenumber = 0;

    open(IN, "<$infile");
    open(OUT, ">$outfile");

    # Update existing phrases
    while(<IN>) {

	chomp;
	@fields = split(/:/, $_);

	# get data for this line
	$phindid = shift @fields;

	# output the phrase data, with thesaurus information
	print OUT "$phindid:", join(":", @fields);

	# add thesaurus data; blank the entry afterwards so the loop
	# below only emits entries not already present in phrases.2
	if (defined($thesaurusdata{$phindid})) {
	    @fields = split(/:/, $thesaurusdata{$phindid});
	    shift @fields;
	    $linkcounter = shift @fields;
	    $relations = shift @fields;

	    print OUT ":$linkcounter:$relations";
	    $thesaurusdata{$phindid} = "";
	}
	print OUT "\n";
    }
    close IN;

    # Add phrases that aren't already in the file
    # (thesaurus-only phrases get zero frequency/expansion/document counts)
    foreach $phindid (sort numerically keys %thesaurusdata) {
	next unless ($thesaurusdata{$phindid});

	@fields = split(/:/, $thesaurusdata{$phindid});
	$symbols = shift @fields;
	$linkcounter = shift @fields;
	$relations = shift @fields;

	print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
    }
    close OUT;

}
1404
1405# restore_vocabulary_data
1406#
1407# Read phrases.3 and restore vocabulary information. Then write
1408# this data to the MGPP input files (pword.txt and pdata.txt) and
1409# (if requested) to the saved phrases file.
1410
sub restore_vocabulary_data {
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);

    my $phinddir = $self->{'phinddir'};
    my $infile = &FileUtils::filenameConcatenate($phinddir, 'phrases.3');
    my $vocabfile = &FileUtils::filenameConcatenate($phinddir, 'clauses.vocab');
    my $datafile = &FileUtils::filenameConcatenate($phinddir, 'pdata.txt');
    my $wordfile = &FileUtils::filenameConcatenate($phinddir, 'pword.txt');

    my $savephrases = $self->{'savephrases'};

    # 1.
    # Read the vocabulary file: line i holds the word for symbol number i
    open(V, "<$vocabfile")
	|| die "Cannot open $vocabfile: $!";
    my @symbol;
    my $i = 1;
    while(<V>) {
	chomp;
	$symbol[$i++] = $_;
    }
    close V;

    # 2.
    # Translate phrases.3 to MGPP input files
    my ($key, $text, $word, $isThesaurus, $line);
    my @fields;
    my $linenumber = 0;

    open(IN, "<$infile")
	|| die "Cannot open $infile: $!";
    open(DATA, ">$datafile")
	|| die "Cannot open $datafile: $!";
    open(WORD, ">$wordfile")
	|| die "Cannot open $wordfile: $!";

    # Save the phrases in a separate text file
    if ($savephrases) {
	print $out "Saving phrases in $savephrases\n" if ($verbosity);
	open(SAVE, ">$savephrases")
	    || die "Cannot open $savephrases: $!";
    }

    while(<IN>) {

	# read the line
	chomp;
	$line = $_;
	@fields = split(/:/, $line);

	# get a phrase number for this line
	$key = shift @fields;

	# restore the text of the phrase by expanding each symbol number
	$text = shift @fields;
	$text =~ s/s(\d+)/$symbol[$1]/g;
	if ($text =~ / /) {
	    # multi-word phrase: nothing for the single-word index
	    $word = "";
	} elsif ($text ne 'untranslated') {
	    $word = $text;
	} else {
	    # BUGFIX: previously $word was left holding its value from
	    # the previous line here, so 'untranslated' phrases were
	    # indexed under an unrelated word (or an uninitialised value
	    # on the first line).
	    $word = "";
	}

	# output the phrase data
	print DATA "<Document>";
	print DATA "$key:$text:", join(":", @fields), ":\n";

	# output the word index search data
	print WORD "<Document>$word\n";

	# output the phrases to a text file
	if ($savephrases) {
	    # lines that carry thesaurus data have two extra fields
	    if ((scalar @fields) == 7) {
		$isThesaurus = 1;
	    } else {
		$isThesaurus = 0;
	    }
	    print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
	}
    }
    close IN;
    close WORD;
    close DATA;
    close SAVE if ($savephrases);

}
1496
1497
1498
1499# sort routines used to renumber phrases
1500
# Sort comparator: plain ascending numeric order.
sub numerically { return $a <=> $b; }
1502
# Sort comparator for document-occurrence entries of the form
# "docnum" or "docnum,frequency".  An entry without an explicit
# frequency counts as frequency 1.  Orders by descending frequency.
sub by_doc_frequency {
    my ($freq_a, $freq_b) = (1, 1);

    # extract the frequency part by stripping the leading "docnum,"
    if ($a =~ /,/) {
	($freq_a = $a) =~ s/\d+,//;
    }
    if ($b =~ /,/) {
	($freq_b = $b) =~ s/\d+,//;
    }

    return ($freq_b <=> $freq_a);
}
1517
15181;
Note: See TracBrowser for help on using the repository browser.