source: gs2-extensions/parallel-building/trunk/src/perllib/classify/Phind.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl committed in the main trunk but not in the x64 branch)

File size: 40.9 KB
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32package Phind;
33
34use BaseClassifier;
35use util;
36use ghtml;
37use unicode;
38
39use strict;
40no strict 'refs'; # allow filehandles to be variables and vice versa
41
42my @removedirs = ();
43
44my %wanted_index_files = ('td'=>1,
45 't'=>1,
46 'ti'=>1,
47 'tl'=>1,
48 'tsd'=>1,
49 'idb'=>1,
50 'ib1'=>1,
51 'ib2'=>1,
52 'ib3'=>1,
53 'i'=>1,
54 'il'=>1,
55 'w'=>1,
56 'wa'=>1);
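# These are the file suffixes of the index files that must survive the
# clean-up in the END block below; any other file left in a phind build
# directory is deleted once processing has finished.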
57
58sub BEGIN {
59 @Phind::ISA = ('BaseClassifier');
60}
61
62sub END {
63
64 # Tidy up stray files - we do this here as there's some weird problem
65 # preventing us from doing it in the get_classify_info() function (on
66 # windows at least) where the close() appears to fail on txthandle and
67 # dochandle, thus preventing us from deleting those files
68
69 foreach my $dir (@removedirs) {
70 if (-d $dir && opendir (DIR, $dir)) {
71 my @files = readdir DIR;
72 closedir DIR;
73
74 foreach my $file (@files) {
75 next if $file =~ /^\.\.?$/;
76 my ($suffix) = $file =~ /\.([^\.]+)$/;
77 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
78 # delete it!
79 &util::rm (&util::filename_cat ($dir, $file));
80 }
81 }
82 }
83 }
84}
85
86my $arguments =
87 [ { 'name' => "text",
88 'desc' => "{Phind.text}",
89 'type' => "string",
90 'deft' => "section:Title,section:text",
91 'reqd' => "no" },
92 { 'name' => "title",
93 'desc' => "{Phind.title}",
94 'type' => "metadata",
95 'deft' => "Title",
96 'reqd' => "no" },
97 { 'name' => "buttonname",
98 'desc' => "{BasClas.buttonname}",
99 'type' => "string",
100 'deft' => "Phrase",
101 'reqd' => "no" },
102 { 'name' => "language",
103 'desc' => "{Phind.language}",
104 'type' => "string",
105 'deft' => "en",
106 'reqd' => "no" },
107 { 'name' => "savephrases",
108 'desc' => "{Phind.savephrases}",
109 'type' => "string",
110 'deft' => "",
111 'reqd' => "no" },
112 { 'name' => "suffixmode",
113 'desc' => "{Phind.suffixmode}",
114 'type' => "int",
115 'deft' => "1",
116 'range' => "0,1",
117 'reqd' => "no" },
118 { 'name' => "min_occurs",
119 'desc' => "{Phind.min_occurs}",
120 'type' => "int",
121 'deft' => "2",
122 'range' => "1,",
123 'reqd' => "no" },
124 { 'name' => "thesaurus",
125 'desc' => "{Phind.thesaurus}",
126 'type' => "string",
127 'deft' => "",
128 'reqd' => "no" },
129 { 'name' => "untidy",
130 'desc' => "{Phind.untidy}",
131 'type' => "flag",
132 'reqd' => "no" } ];
133
134my $options = { 'name' => "Phind",
135 'desc' => "{Phind.desc}",
136 'abstract' => "no",
137 'inherits' => "yes",
138 'args' => $arguments };
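# For example, a collect.cfg line enabling this classifier might look
# something like the following (illustrative values only; any option left
# out falls back to the 'deft' defaults listed above):
#
#   classify Phind -text section:Title,section:text -title Title -buttonname Phrase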
139
140
141# Phrase delimiter symbols - these should be abstracted out someplace
142
143my $colstart = "COLLECTIONSTART";
144my $colend = "COLLECTIONEND";
145my $doclimit = "DOCUMENTLIMIT";
146my $senlimit = "SENTENCELIMIT";
147my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
148
149
150# Create a new Phind browser based on collect.cfg
151
152sub new {
153 my ($class) = shift (@_);
154 my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
155 push(@$classifierslist, $class);
156
157 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
158 push(@{$hashArgOptLists->{"OptList"}},$options);
159
160 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
161
162 if ($self->{'info_only'}) {
163 # don't worry about any options etc
164 return bless $self, $class;
165 }
166
167 # Ensure the Phind generate scripts are in place
168 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
169 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
170 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
171 if (!(-e $file1)) {
172 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
173 exit(1);
174 }
175
176	 # these options may contain "ex." metadata prefixes that need to be stripped off
177 $self->{'text'} = $self->strip_ex_from_metadata($self->{'text'});
178 $self->{'title'} = $self->strip_ex_from_metadata($self->{'title'});
179
180	 # Transfer the value from the automatic argument parsing to the variable name used in previous Greenstone versions.
181
182 $self->{"indexes"} = $self->{"text"};
183
184 # Further setup
185 $self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
186 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
187 if (! defined $self->{'builddir'}) {
188 $self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
189 }
190 $self->{'total'} = 0;
191
192	 # set the Phind text orientation to right-to-left (rtl) if the language is Arabic
193 if ($self->{'language'} eq "ar") {
194 $self->{'textorientation'} = "rtl";
195 }
196 # Clean out the unused keys
197 delete $self->{"text"};
198
199 return bless $self, $class;
200}
201
202
203# Initialise the Phind classifier
204
205sub init {
206 my $self = shift (@_);
207
208 # ensure we have a build directory
209 my $builddir = $self->{'builddir'};
210 die unless (-e "$builddir");
211
212 # create Phind directory
213 my $phnumber = 1;
214 my $phinddir = &util::filename_cat($builddir, "phind1");
215 while (-e "$phinddir") {
216 $phnumber++;
217 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
218 }
219 &util::mk_dir("$phinddir");
220 $self->{'phinddir'} = $phinddir;
221 $self->{'phindnumber'} = $phnumber;
222
223 push(@removedirs, $phinddir) unless $self->{'untidy'};
224
225 # open filehandles for documents and text
226 my $clausefile = &util::filename_cat("$phinddir", "clauses");
227 &util::rm($clausefile) if (-e $clausefile);
228
229 my $txthandle = 'TEXT' . $phnumber;
230 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
231 $self->{'txthandle'} = $txthandle;
232
233 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
234 &util::rm($docfile) if (-e $docfile);
235
236 my $dochandle = 'DOC' . $phnumber;
237 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
238 $self->{'dochandle'} = $dochandle;
239
240}
241
242
243# Classify each document.
244#
245# Each document is passed here in turn. The classifier extracts the
246# text of each and stores it in the clauses file. Document details are
247# stored in the docs.txt file.
248
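# For example (illustrative OID and title), each document adds a line such as
#
#   <Document>	HASH0158f5	Farming snails
#
# to docs.txt, while its clause text is appended to the clauses file after a
# DOCUMENTLIMIT delimiter line that marks the start of that document.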
249sub classify {
250 my $self = shift (@_);
251 my ($doc_obj) = @_;
252
253 my $verbosity = $self->{'verbosity'};
254 my $top_section = $doc_obj->get_top_section();
255
256 my $titlefield = $self->{'title'};
257
258 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
259 if (!defined($title)) {
260 $title = "";
261 print STDERR "Phind: document has no title\n";
262 }
263 print "process: $title\n" if ($verbosity > 2);
264
265 # Only consider the file if it is in the correct language
266 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
267 my $phrlanguage = $self->{'language'};
268 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
269
270 # record this file
271 $self->{'total'} ++;
272 # what is $file ???
273 # print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
274
275 # Store document details
276 my $OID = $doc_obj->get_OID();
277 $OID = "NULL" unless defined $OID;
278 my $dochandle = $self->{'dochandle'};
279 print $dochandle "<Document>\t$OID\t$title\n";
280
281	 # Store the text occurring in this object
282
283 # output the document delimiter
284 my $txthandle = $self->{'txthandle'};
285 print $txthandle "$doclimit\n";
286
287 # iterate over the required indexes and store their text
288 my $indexes = $self->{'indexes'};
289 my $text = "";
290 my ($part, $level, $field, $section, $data, $dataref);
291
292 foreach $part (split(/,/, $indexes)) {
293
294	 # Each field has a level and a data element (e.g. document:Title)
295 ($level, $field) = split(/:/, $part);
296 die unless ($level && $field);
297
298 # Extract the text from every section
299 # (In phind, document:text and section:text are equivalent)
300 if ($field eq "text") {
301 $data = "";
302 $section = $doc_obj->get_top_section();
303 while (defined($section)) {
304 $data .= $doc_obj->get_text($section) . "\n";
305 $section = $doc_obj->get_next_section($section);
306 }
307 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
308 }
309
310 # Extract a metadata field from a document
311 # (If there is more than one element of the given type, get them all.)
312 elsif ($level eq "document") {
313 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
314 foreach $data (@$dataref) {
315 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
316 }
317 }
318
319 # Extract metadata from every section in a document
320 elsif ($level eq "section") {
321 $data = "";
322 $section = $doc_obj->get_top_section();
323 while (defined($section)) {
324 $dataref = $doc_obj->get_metadata($section, $field);
325 $data .= join("\n", @$dataref) . "\n";
326 $section = $doc_obj->get_next_section($section);
327 }
328 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
329 }
330
331	 # Any other level specification is an error
332 else {
333 die "Unknown level ($level) in Phind index ($part)\n";
334 }
335
336 }
337
338 # output the text
339 $text =~ tr/\n//s;
340 print $txthandle "$text";
341}
342
343
344# Construct the classifier from the information already gathered
345#
346# When get_classify_info is called, the clauses and docs.txt files have
347# already been constructed in the Phind directory. This function will
348# translate them into compressed, indexed MGPP files that can be read by
349# the phindcgi script. It will also register our classifier so that it
350# shows up in the navigation bar.
351
352sub get_classify_info {
353 my $self = shift (@_);
354 my ($gli) = @_;
355
356 close $self->{'dochandle'};
357 close $self->{'txthandle'};
358 my $verbosity = $self->{'verbosity'};
359 my $out = $self->{'outhandle'};
360 my $phinddir = $self->{'phinddir'};
361
362 my $osextra = "";
363 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
364 $osextra = " -d /";
365 }
366
367 print STDERR "</Stage>\n" if $gli;
368
369 if ($verbosity) {
370 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
371 print $out "*** in ", $self->{'phinddir'}, "\n";
372 }
373
374 print STDERR "<Stage name='Phind'>\n" if $gli;
375
376 # Construct phind indexes
377 my $suffixmode = $self->{'suffixmode'};
378 my $min_occurs = $self->{'min_occurs'};
379 my ($command, $status);
380
381 # Generate the vocabulary, symbol statistics, and numbers file
382 # from the clauses file
383 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
384 print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
385 &extract_vocabulary($self);
386
387 # Use the suffix program to generate the phind/phrases file
388 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
389 print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
390 &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
391
392 # check that we generated some files. It's not necessarily an error if
393 # we didn't (execute() would have quit on error), but we can't go on.
394 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
395 if (! -r $phrasesfile) {
396 print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
397 print $out "\nNo phrases found for Phind classifier!\n";
398 return;
399 }
400
401 # Create the phrase file and put phrase numbers in phind/phrases
402 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
403 print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
404 &renumber_phrases($self);
405
406 print $out "\nCreating phrase databases\n";
407 print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
408 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
409 my $mg_stem = &util::filename_cat($phinddir, "pdata");
410
411 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
412 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
413 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
414
415 # create the mg index of words
416 print $out "\nCreating word-level search indexes\n";
417 print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
418 $mg_input = &util::filename_cat($phinddir, "pword.txt");
419 $mg_stem = &util::filename_cat($phinddir, "pword");
420
421 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
422 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
423 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
424 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
425 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
426 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
427
428 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
429 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
430 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
431
432 # create the mg document information database
433 print $out "\nCreating document information databases\n";
434 print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
435 $mg_input = &util::filename_cat($phinddir, "docs.txt");
436 $mg_stem = &util::filename_cat($phinddir, "docs");
437
438 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
439 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
440 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
441
442 my $parameters = "phindnumber=$self->{'phindnumber'}";
443 if (defined ($self->{'textorientation'})) {
444 $parameters .= ";textorientation=$self->{'textorientation'}";
445 }
446 # Return the information about the classifier that we'll later want to
447 # use to create macros when the Phind classifier document is displayed.
448 my %classifyinfo = ('thistype'=>'Invisible',
449 'childtype'=>'Phind',
450 'Title'=>$self->{'buttonname'},
451 'parameters'=>$parameters,
452 'contains'=>[]);
453
454 my $collection = $self->{'collection'};
455 my $url = "library?a=p&p=phind&c=$collection";
456 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
457
458 return \%classifyinfo;
459}
460
461
462
463sub convert_gml_to_tokens {
464
465 my ($language_exp, $text) = @_;
466
467 # escape any magic words... - jrm21
468 foreach my $delim (@delimiters) {
469 my $replacement=lc($delim);
470 my $num= $text=~ s/$delim/$replacement/g;
471 if (!$num) {$num=0;}
472 }
473
474 if ($language_exp =~ /^en$/) {
475 return &convert_gml_to_tokens_EN($text);
476 }
477
478 if ($language_exp =~ /zh/) {
479 return &convert_gml_to_tokens_ZH($text);
480 }
481
482 $_ = $text;
483
484 # 1. remove GML tags
485
486 # Remove everything that is in a tag
487 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
488 s/\s*<br>\s*/ LINEBREAK /isgo;
489 s/<[^>]*>/ /sgo;
490
491 # Now we have the text, but it may contain HTML
492 # elements coded as &gt; etc. Remove these tags.
493 s/&amp;/&/sgo;
494 s/&lt;/</sgo;
495 s/&gt;/>/sgo;
496 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
497 s/\s*<br>\s*/ LINEBREAK /isgo;
498 s/<[^>]*>/ /sgo;
499
500	 # replace <p> and <br> placeholders with the clause break symbol (\n)
501 s/\s+/ /gso;
502 s/PARAGRAPHBREAK/\n/sgo;
503 s/LINEBREAK/\n/sgo;
504
505
506 # 2. Split the remaining text into space-delimited tokens
507
508 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
509 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
510
511 # Split text at word boundaries
512 s/\b/ /go;
513
514 # 3. Convert the remaining text to "clause format"
515
516 # Insert newline if the end of a sentence is detected
517	 # (delimiter is: "[\.\?\!]\s")
518 # s/\s*[\.\?\!]\s+/\n/go;
519
520 # remove unnecessary punctuation and replace with clause break symbol (\n)
521	 # the following very nicely removes all non-alphanumeric characters; too bad if you are not using English...
522 #s/[^\w ]/\n/go;
523 # replace punct with new lines - is this what we want??
524 s/\s*[\?\;\:\!\,\.\"\[\]\{\}\(\)]\s*/\n/go; #"
525 # then remove other punct with space
526 s/[\'\`\\\_]/ /go;
527
528 # remove extraneous whitespace
529 s/ +/ /sgo;
530 s/^\s+//mgo;
531 s/\s*$/\n/mgo;
532
533 # remove lines that contain one word or less
534 s/^\S*$//mgo;
535 s/^\s*$//mgo;
536 tr/\n//s;
537
538 return $_;
539}
540
541# a chinese version
542sub convert_gml_to_tokens_ZH {
543
544 $_ = shift @_;
545
546 # Replace all whitespace with a simple space
547 s/\s+/ /gs;
548 # Remove everything that is in a tag
549 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
550 s/\s*<br>\s*/ LINEBREAK /isg;
551 s/<[^>]*>/ /sg;
552
553 # Now we have the text, but it may contain HTML
554 # elements coded as &gt; etc. Remove these tags.
555 s/&lt;/</sg;
556 s/&gt;/>/sg;
557
558 s/\s+/ /sg;
559 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
560 s/\s*<br>\s*/ LINEBREAK /isg;
561 s/<[^>]*>/ /sg;
562
563 # remove &amp; and other miscellaneous markup tags
564 s/&amp;/&/sg;
565 s/&lt;/</sg;
566 s/&gt;/>/sg;
567 s/&amp;/&/sg;
568
569	 # replace <p> and <br> placeholders with newlines
570 s/PARAGRAPHBREAK/\n/sg;
571 s/LINEBREAK/\n/sg;
572
573
574# print STDERR "text:$_\n";
575 return $_;
576}
577
578# A version of convert_gml_to_tokens that is fine-tuned to the English language.
579
580sub convert_gml_to_tokens_EN {
581 $_ = shift @_;
582
583 # FIRST, remove GML tags
584
585 # Replace all whitespace with a simple space
586 s/\s+/ /gs;
587
588 # Remove everything that is in a tag
589 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
590 s/\s*<br>\s*/ LINEBREAK /isg;
591 s/<[^>]*>/ /sg;
592
593 # Now we have the text, but it may contain HTML
594 # elements coded as &gt; etc. Remove these tags.
595 s/&lt;/</sg;
596 s/&gt;/>/sg;
597
598 s/\s+/ /sg;
599 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
600 s/\s*<br>\s*/ LINEBREAK /isg;
601 s/<[^>]*>/ /sg;
602
603 # remove &amp; and other miscellaneous markup tags
604 s/&amp;/&/sg;
605 s/&lt;/</sg;
606 s/&gt;/>/sg;
607 s/&amp;/&/sg;
608
609	 # replace <p> and <br> placeholders with newlines
610 s/PARAGRAPHBREAK/\n/sg;
611 s/LINEBREAK/\n/sg;
612
613
614 # Exceptional punctuation
615 #
616 # We make special cases of some punctuation
617
618 # remove any apostrophe that indicates omitted letters
619 s/(\w+)\'(\w*\s)/ $1$2 /g;
620
621	 # remove a period that appears in a person's initials
622 s/\s([A-Z])\./ $1 /g;
623
624	 # replace hyphens in hyphenated words and names with a space
625 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
626
627	 # Convert the remaining text to "clause format".
628	 # This means removing all excess punctuation and garbage text,
629	 # normalising valid punctuation to full stops and commas,
630	 # then putting one clause on each line.
631
632 # Insert newline when the end of a sentence is detected
633	 # (delimiter is: "[\.\?\!]\s")
634 s/\s*[\.\?\!]\s+/\n/g;
635
636 # split numbers after four digits
637 s/(\d\d\d\d)/$1 /g;
638
639 # split words after 32 characters
640
641 # squash repeated punctuation
642 tr/A-Za-z0-9 //cs;
643
644 # save email addresses
645 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
646
647 # normalise clause breaks (mostly punctuation symbols) to commas
648 s/[^A-Za-z0-9 \n]+/ , /g;
649
650 # Remove repeated commas, and replace with newline
651 s/\s*,[, ]+/\n/g;
652
653 # remove extra whitespace
654 s/ +/ /sg;
655 s/^\s+//mg;
656 s/\s*$/\n/mg;
657
658 # remove lines that contain one word or less
659 s/^\w*$//mg;
660 s/^\s*$//mg;
661 tr/\n//s;
662
663 return $_;
664
665}
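# Roughly, an input fragment such as
#
#   <p>Dr. J. Smith's well-known book, New Zealand Birds, was published in 1999.</p>
#
# comes out in clause format as something like
#
#   J Smiths well known book
#   New Zealand Birds
#   was published in 1999
#
# with single-word lines (the stranded "Dr") discarded.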
666
667
668
669# Execute a system command
670
671sub execute {
672 my ($command, $verbosity, $outhandle) = @_;
673 print $outhandle "Executing: $command\n" if ($verbosity > 2);
674 $! = 0;
675 my $status = system($command);
676 if ($status != 0) {
677 print STDERR "Phind - Error executing '$command': $!\n";
678 exit($status); # this causes the build to fail...
679 }
680}
681
682
683# Generate the vocabulary, symbol statistics, and numbers file from the
684# clauses file. This is legacy code, so it is a bit messy and probably won't
685# run under Windows.
686
687sub extract_vocabulary {
688 my ($self) = @_;
689
690 my $verbosity = $self->{'verbosity'};
691 my $out = $self->{'outhandle'};
692
693 my $collectiondir = $self->{'collectiondir'};
694 my $phinddir = $self->{'phinddir'};
695
696 my $language_exp = $self->{'language'};
697
698 my ($w, $l, $line, $word);
699
700 my ($first_delimiter, $last_delimiter,
701 $first_stopword, $last_stopword,
702 $first_extractword, $last_extractword,
703 $first_contentword, $last_contentword,
704 $phrasedelimiter);
705
706 my $thesaurus = $self->{'thesaurus'};
707 my ($thesaurus_links, $thesaurus_terms,
708 %thesaurus, $first_thesaurusword, $last_thesaurusword);
709
710 my %symbol;
711 my (%freq);
712
713 print $out "Calculating vocabulary\n" if ($verbosity > 1);
714
715 # Read and store the stopwords
716 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
717 my $stopword_files = ();
718 my ($language, $language_dir, $file, $file_name);
719 my %stopwords;
720
721 # Examine each directory in the stopword directory
722 opendir(STOPDIR, $stopdir);
723 foreach $language (readdir STOPDIR) {
724
725 # Ignore entries that do not match the classifier's language
726 next unless ($language =~ /$language_exp/);
727 $language_dir = &util::filename_cat($stopdir, $language);
728 next unless (-d "$language_dir");
729
730 opendir(LANGDIR, $language_dir);
731 foreach $file (readdir LANGDIR) {
732
733 # Ignore entries that are not stopword files
734 next unless ($file =~ /sw$/);
735 $file_name = &util::filename_cat($language_dir, $file);
736 next unless (-f "$file_name");
737
738 # Read the stopwords
739 open(STOPFILE, "<$file_name");
740 while (<STOPFILE>) {
741 s/^\s+//;
742 s/\s.*//;
743 $word = $_;
744 $l = lc($word);
745 $stopwords{$l} = $word;
746 }
747 close STOPFILE;
748
749 }
750 closedir LANGDIR;
751 }
752 closedir STOPDIR;
753
754 # Read thesaurus information
755 if ($thesaurus) {
756
757	 # ensure the link file exists
758 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
759 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
760
761 # ensure term file exists in the correct language
762 if ($language_exp =~ /^([a-z][a-z])/) {
763 $language = $1;
764 } else {
765 $language = 'en';
766 }
767 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
768 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
769
770
771 # Read the thesaurus terms
772 open(TH, "<$thesaurus_terms");
773 while(<TH>) {
774 s/^\d+ //;
775 s/\(.*\)//;
776 foreach $w (split(/\s+/, $_)) {
777 $thesaurus{lc($w)} = $w;
778 }
779 }
780 close TH;
781 }
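# As read above, each line of the "$thesaurus.$language" term file starts
# with a term number followed by the words of the term and an optional
# parenthesised scope note (which is stripped). The "$thesaurus.lnk" link
# file holds one "number:relations" line per term, where the relations are
# semicolon-separated groups that begin with a link type (e.g. BT, NT)
# followed by comma-separated term numbers.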
782
783	 # Read words in the text and count occurrences
784 open(TXT, "<$phinddir/clauses");
785
786 my @words;
787 while(<TXT>) {
788 $line = $_;
789 next unless ($line =~ /./);
790
791 @words = split(/\s+/, $line);
792 foreach $w (@words) {
793 $l = lc($w);
794 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
795 $freq{$w}++;
796 }
797 $freq{$senlimit}++;
798 }
799
800 close TXT;
801
802 # Calculate the "best" form of each word
803 my (%bestform, %totalfreq, %bestfreq);
804
805 foreach $w (sort (keys %freq)) {
806 $l = lc($w);
807
808 # totalfreq is the number of times a term appears in any form
809 $totalfreq{$l} += $freq{$w};
810
811 if (defined $stopwords{$l}) {
812 $bestform{$l} = $stopwords{$l};
813
814 } elsif (defined $thesaurus{$l}) {
815 $bestform{$l} = $thesaurus{$l};
816
817 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
818 $bestfreq{$l} = $freq{$w};
819 $bestform{$l} = $w;
820 }
821 }
822 undef %freq;
823 undef %bestfreq;
824
825
826 # Assign symbol numbers to tokens
827 my $nextsymbol = 1;
828 my (@vocab);
829
830 # Delimiters
831 $first_delimiter = 1;
832
833 foreach $word (@delimiters) {
834
835# $word = lc($word); # jrm21
836 $word = uc($word);
837 $bestform{$word} = $word;
838 $vocab[$nextsymbol] = $word;
839 $symbol{$word} = $nextsymbol;
840 $nextsymbol++;
841 }
842 $last_delimiter = $nextsymbol - 1;
843 # Stopwords
844 $first_stopword = $nextsymbol;
845
846 foreach my $word (sort keys %stopwords) {
847 # don't include stopword unless it occurs in the text
848 $word = lc($word);
849 next unless ($totalfreq{$word});
850 next if ($symbol{$word});
851
852 $vocab[$nextsymbol] = $word;
853 $symbol{$word} = $nextsymbol;
854 $nextsymbol++;
855 }
856 $last_stopword = $nextsymbol - 1;
857 $first_contentword = $nextsymbol;
858
859 # Thesaurus terms
860 if ($thesaurus) {
861 $first_thesaurusword = $nextsymbol;
862
863 foreach my $word (sort keys %thesaurus) {
864
865 $word = lc($word);
866 next if ($symbol{$word});
867 $bestform{$word} = $thesaurus{$word};
868
869 $vocab[$nextsymbol] = $word;
870 $symbol{$word} = $nextsymbol;
871 $nextsymbol++;
872
873 }
874 $last_thesaurusword = $nextsymbol - 1;
875 }
876
877 # Other content words
878 $first_extractword = $nextsymbol;
879
880 foreach my $word (sort (keys %bestform)) {
881
882 next if ($symbol{$word});
883
884 $vocab[$nextsymbol] = $word;
885 $symbol{$word} = $nextsymbol;
886 $nextsymbol++;
887 }
888 $last_extractword = $nextsymbol - 1;
889 $last_contentword = $nextsymbol - 1;
890
891	 # Output the words
892 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
893 open(VOC, ">$phinddir/clauses.vocab");
894
895 for (my $i = 1; $i < $nextsymbol; $i++) {
896 $w = $vocab[$i];
897
898 print VOC "$bestform{$w}\n";
899 $totalfreq{$w} = 0 unless ($totalfreq{$w});
900 }
901 close VOC;
902
903
904 # Create statistics file
905	 # Output statistics about the vocabulary
906 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
907 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
908
909 open(STAT, ">$phinddir/clauses.stats")
910 || die "Cannot open $phinddir/clauses.stats: $!";
911
912 print STAT "first_delimiter $first_delimiter\n";
913 print STAT "last_delimiter $last_delimiter\n";
914 print STAT "first_stopword $first_stopword\n";
915 print STAT "last_stopword $last_stopword\n";
916 if ($thesaurus) {
917 print STAT "first_thesaurusword $first_thesaurusword\n";
918 print STAT "last_thesaurusword $last_thesaurusword\n";
919 }
920 print STAT "first_extractword $first_extractword\n";
921 print STAT "last_extractword $last_extractword\n";
922 print STAT "first_contentword $first_contentword\n";
923 print STAT "last_contentword $last_contentword\n";
924 print STAT "first_symbol $first_delimiter\n";
925 print STAT "last_symbol $last_contentword\n";
926 print STAT "first_word $first_stopword\n";
927 print STAT "last_word $last_contentword\n";
928 close STAT;
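# Of the ranges recorded above, first_symbol..last_symbol spans every token
# (delimiters, stopwords, thesaurus terms and extracted content words),
# while first_word..last_word covers the same set without the delimiters.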
929
930 undef @vocab;
931
932
933 # Create numbers file
934 # Save text as symbol numbers
935 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
936
937 open(TXT, "<$phinddir/clauses");
938 open(NUM, ">$phinddir/clauses.numbers");
939
940## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
941## print NUM "$symbol{lc($colstart)}\n"; # jrm21
942 $phrasedelimiter = $symbol{$senlimit};
943 print NUM "$symbol{$colstart}\n";
944
945	 # translate each clause in turn into a list of symbol numbers
946 while(<TXT>) {
947
948 # split sentence into a list of tokens
949 $line = $_;
950 next unless ($line =~ /./);
951 @words = split(/\s+/, $line);
952
953 # output one token at a time
954 foreach $word (@words) {
955# don't lower-case special delimiters - jrm21
956 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
957 $word = lc($word);
958 }
959 print NUM "$symbol{$word}\n";
960 }
961
962 # output phrase delimiter
963 print NUM "$phrasedelimiter\n";
964 }
965
966 close TXT;
967# print NUM "$symbol{lc($colend)}\n";# jrm21
968 print NUM "$symbol{$colend}\n";
969 close NUM;
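# clauses.numbers now holds one symbol number per line: the COLLECTIONSTART
# symbol, then each clause's token symbols followed by the SENTENCELIMIT
# (phrase delimiter) symbol, and finally the COLLECTIONEND symbol.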
970
971	 # Save thesaurus data in one convenient file
972 if ($thesaurus) {
973
974 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
975
976
977 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
978 if ($verbosity > 1);
979
980 # Read the thesaurus terms
981 my ($num, $text, %thes_symbols);
982
983 open(TH, "<$thesaurus_terms");
984 while(<TH>) {
985 chomp;
986 @words = split(/\s+/, $_);
987 $num = shift @words;
988 $text = "";
989
990 # translate words into symbol numbers
991 foreach $word (@words) {
992 $word = lc($word);
993 if ($symbol{$word}) {
994 $text .= "s$symbol{$word} ";
995 } elsif ($verbosity) {
996 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
997 }
998 }
999 $text =~ s/ $//;
1000 $thes_symbols{$num} = $text;
1001 }
1002 close TH;
1003
1004 # Read the thesaurus links and write the corresponding data
1005 open(TH, "<$thesaurus_links");
1006 open(THOUT, ">$thesaurusfile");
1007
1008 while(<TH>) {
1009 chomp;
1010 ($num, $text) = split(/:/, $_);
1011
1012 if (defined($thes_symbols{$num})) {
1013 print THOUT "$num:$thes_symbols{$num}:$text\n";
1014 } else {
1015 print THOUT "$num:untranslated:$text\n";
1016 }
1017 }
1018 close TH;
1019 close THOUT;
1020 }
1021
1022
1023
1024
1025}
1026
1027
1028# renumber_phrases
1029#
1030# Prepare the phrases file to be input to mgpp. The biggest problem is
1031# reconciling the phrase identifiers used by the suffix program (which
1032# we'll call suffix-id numbers) with the numbers used in the thesaurus
1033# (thesaurus-id) to create a common set of phind id numbers (phind-id).
1034# Phind-id numbers must be sorted by frequency of occurrence.
1035#
1036# Start creating a set of phind-id numbers from the sorted suffix-id
1037# numbers and (if required) the thesaurus-id numbers. Then add any other
1038# phrases occurring in the thesaurus.
1039#
1040# The last thing we have to do is restore the vocabulary information to the
1041# phrase file so that the phrases are stored as words, not as symbol
1042# numbers.
1043
1044# The original phrases file looks something like this:
1045# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1046# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1047# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1048# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
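# Reading the first example line: suffix-id 159396-1 names a phrase whose
# symbol string is "s5175", with a total frequency of 4, one expansion (the
# phrase with suffix-id 116149-2) and occurrences in 3 documents: d2240
# twice (the ",2") and d2253 and d2254 once each (no suffix means a
# frequency of 1).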
1049
1050
1051sub renumber_phrases {
1052 my ($self) = @_;
1053
1054 renumber_suffix_data($self);
1055 renumber_thesaurus_data($self);
1056 restore_vocabulary_data($self);
1057
1058}
1059
1060
1061
1062# renumber_suffix_data
1063#
1064# Translate phrases file to phrases.2 using phind keys instead
1065# of suffix keys and sorting the expansion data.
1066
1067sub renumber_suffix_data {
1068 my ($self) = @_;
1069
1070 my $verbosity = $self->{'verbosity'};
1071 my $out = $self->{'outhandle'};
1072	 print $out "Translate phrases: suffix-ids become phind-ids\n"
1073 if ($verbosity);
1074
1075 my $phinddir = $self->{'phinddir'};
1076 my $infile = &util::filename_cat($phinddir, 'phrases');
1077 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1078
1079 # Read the phrase file. Calculate initial set of phind-id
1080 # numbers and store (suffixid -> frequency) relation.
1081
1082 my %suffixtophind;
1083 my @totalfrequency;
1084 my (@fields, $suffixid);
1085 my $nextphind = 1;
1086
1087 open(IN, "<$infile");
1088 while(<IN>) {
1089
1090 chomp;
1091 @fields = split(/:/, $_);
1092
1093 # get next suffixid and phindid
1094 $suffixid = shift @fields;
1095 $suffixtophind{$suffixid} = $nextphind;
1096
1097 # store total frequency
1098 shift @fields;
1099 $totalfrequency[$nextphind] = shift @fields;
1100
1101 $nextphind++;
1102 }
1103 close IN;
1104
1105
1106 # Translate phrases file to phrases.2. Use phind keys (not suffix
1107# keys), sort expansion and document occurrence data in order of
1108# descending frequency.
1109 open(IN, "<$infile");
1110 open(OUT, ">$outfile");
1111
1112 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1113 my (@documwents, @newexp, $k, $n);
1114 my $linenumber = 0;
1115
1116 while(<IN>) {
1117
1118 # read the line
1119 chomp;
1120 @fields = split(/:/, $_);
1121
1122 # get a phrase number for this line
1123 $suffixid = shift @fields;
1124 die unless (defined($suffixtophind{$suffixid}));
1125 $phindid = $suffixtophind{$suffixid};
1126
1127 # get the symbols in the phrase
1128 $text = shift @fields;
1129
1130 # output status information
1131 $linenumber++;
1132 if ($verbosity > 2) {
1133 if ($linenumber % 1000 == 0) {
1134 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1135 }
1136 # what are $num and $key??
1137 #print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1138 }
1139
1140 # get the phrase frequency
1141 $tf = shift @fields;
1142
1143 # get the number of expansions
1144 $countexp = shift @fields;
1145
1146 # get the expansions, convert them into phind-id numbers, and sort them
1147 $expansions = shift @fields;
1148 @newexp = ();
1149 foreach $k (split(/,/, $expansions)) {
1150 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1151 $n = $suffixtophind{$k};
1152 push @newexp, $n;
1153 }
1154 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1155
1156 # get the number of documents
1157 $countdocs = shift @fields;
1158
1159 # get the documents and sort them
1160 $documents = shift @fields;
1161 $documents =~ s/d//g;
1162 my @documents = split(/;/, $documents);
1163 @documents = sort by_doc_frequency @documents;
1164
1165 # output the phrase data
1166 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1167 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1168
1169 }
1170
1171 close IN;
1172 close OUT;
1173}
1174
1175
1176# renumber_thesaurus_data
1177#
1178# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1179
1180sub renumber_thesaurus_data {
1181 my ($self) = @_;
1182
1183 my $out = $self->{'outhandle'};
1184 my $verbosity = $self->{'verbosity'};
1185 my $thesaurus = $self->{'thesaurus'};
1186
1187 my $phinddir = $self->{'phinddir'};
1188 my $infile = &util::filename_cat($phinddir, "phrases.2");
1189 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1190
1191
1192 # If no thesaurus is defined, simply move the phrases file.
1193 if (!$thesaurus) {
1194 print $out "Translate phrases.2: no thesaurus data\n"
1195 if ($verbosity);
1196 &util::mv($infile, $outfile);
1197 return;
1198 }
1199
1200 print $out "Translate phrases.2: add thesaurus data\n"
1201 if ($verbosity);
1202
1203 # 1.
1204 # Read thesaurus file and store (symbols->thesaurusid) mapping
1205 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1206 my %symbolstothesid;
1207 my (@fields, $thesid, $symbols);
1208
1209 open(TH, "<$thesaurusfile");
1210
1211 while (<TH>) {
1212
1213 chomp;
1214 @fields = split(/:/, $_);
1215
1216 # get id and text
1217 $thesid = shift @fields;
1218 $symbols = shift @fields;
1219 $symbolstothesid{$symbols} = $thesid;
1220 }
1221 close TH;
1222
1223 # 2.
1224 # Read phrases file to find thesaurus entries that already
1225 # have a phindid. Store their phind-ids for later translation,
1226 # and store their frequency for later sorting.
1227 my %thesaurustophindid;
1228 my %phindidtofrequency;
1229 my ($phindid, $freq);
1230
1231 open(IN, "<$infile");
1232
1233 while(<IN>) {
1234
1235 chomp;
1236 @fields = split(/:/, $_);
1237
1238 # phindid and symbols for this line
1239 $phindid = shift @fields;
1240 $symbols = shift @fields;
1241 $freq = shift @fields;
1242
1243 # do we have a thesaurus id corresponding to this phrase?
1244 if (defined($symbolstothesid{$symbols})) {
1245 $thesid = $symbolstothesid{$symbols};
1246 $thesaurustophindid{$thesid} = $phindid;
1247 $phindidtofrequency{$phindid} = $freq;
1248 }
1249 }
1250 close IN;
1251
1252 undef %symbolstothesid;
1253
1254 # 3.
1255 # Create phind-id numbers for remaining thesaurus entries,
1256 # and note that their frequency is 0 for later sorting.
1257 my $nextphindid = $phindid + 1;
1258
1259 open(TH, "<$thesaurusfile");
1260 while(<TH>) {
1261
1262 chomp;
1263 @fields = split(/:/, $_);
1264
1265 # read thesaurus-id and ensure it has a corresponding phind-id
1266 $thesid = shift @fields;
1267 if (!defined($thesaurustophindid{$thesid})) {
1268 $thesaurustophindid{$thesid} = $nextphindid;
1269 $phindidtofrequency{$nextphindid} = 0;
1270 $nextphindid++;
1271 }
1272 }
1273 close TH;
1274
1275 # 4.
1276 # Translate thesaurus file, replacing thesaurus-id numbers with
1277 # phind-id numbers.
1278 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1279 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1280 my (@links, $linkid, %linkidtotype, $newrelation);
1281
1282 open(TH, "<$thesaurusfile");
1283 open(TO, ">$newthesaurusfile");
1284 while(<TH>) {
1285
1286 chomp;
1287 @fields = split(/:/, $_);
1288
1289 # phindid and symbols for this line
1290 ($thesid, $symbols, $relations) = @fields;
1291
1292 die unless ($thesid && $symbols);
1293 die unless $thesaurustophindid{$thesid};
1294 $phindid = $thesaurustophindid{$thesid};
1295
1296 # convert each part of the relation string to use phind-id numbers
1297 # at the same time, we want to sort the list by frequency.
1298 undef %linkidtotype;
1299
1300 foreach $linktext (split(/;/, $relations)) {
1301 @linkdata = split(/,/, $linktext);
1302
1303 # remember the linktype (e.g. BT, NT)
1304 $linktype = shift @linkdata;
1305
1306 # store the type of each link
1307 foreach $thesid (@linkdata) {
1308 die unless (defined($thesaurustophindid{$thesid}));
1309 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1310 }
1311 }
1312
1313 # sort the list of links, first by frequency, then by type.
1314 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1315 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1316 $linkcounter = (scalar @links);
1317
1318 # create a string describing the link information
1319 $linktype = $linkidtotype{$links[0]};
1320 $newrelation = $linktype;
1321 foreach $linkid (@links) {
1322 if ($linkidtotype{$linkid} ne $linktype) {
1323 $linktype = $linkidtotype{$linkid};
1324 $newrelation .= ";" . $linktype;
1325 }
1326 $newrelation .= "," . $linkid;
1327 }
1328 $newrelation .= ";";
1329
1330
1331 # output the new line
1332 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1333 }
1334 close TH;
1335 close TO;
1336
1337 undef %thesaurustophindid;
1338 undef %linkidtotype;
1339 undef %phindidtofrequency;
1340
1341 # 5.
1342 # Read thesaurus data (in phind-id format) into memory
1343 my %thesaurusdata;
1344
1345 open(TH, "<$newthesaurusfile");
1346 while(<TH>) {
1347 chomp;
1348 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1349 die unless ($phindid && $symbols);
1350 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1351 }
1352 close TH;
1353
1354 # 6.
1355 # Add thesaurus data to phrases file
1356 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1357 my (@documwents, @newexp, $k, $n);
1358 my $linenumber = 0;
1359
1360 open(IN, "<$infile");
1361 open(OUT, ">$outfile");
1362
1363 # Update existing phrases
1364 while(<IN>) {
1365
1366 chomp;
1367 @fields = split(/:/, $_);
1368
1369 # get data for this line
1370 $phindid = shift @fields;
1371
1372 # output the phrase data, with thesaurus information
1373 print OUT "$phindid:", join(":", @fields);
1374
1375 # add thesaurus data
1376 if (defined($thesaurusdata{$phindid})) {
1377 @fields = split(/:/, $thesaurusdata{$phindid});
1378 shift @fields;
1379 $linkcounter = shift @fields;
1380 $relations = shift @fields;
1381
1382 print OUT ":$linkcounter:$relations";
1383 $thesaurusdata{$phindid} = "";
1384 }
1385 print OUT "\n";
1386 }
1387 close IN;
1388
1389 # Add phrases that aren't already in the file
1390 foreach $phindid (sort numerically keys %thesaurusdata) {
1391 next unless ($thesaurusdata{$phindid});
1392
1393 @fields = split(/:/, $thesaurusdata{$phindid});
1394 $symbols = shift @fields;
1395 $linkcounter = shift @fields;
1396 $relations = shift @fields;
1397
1398 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1399 }
1400 close OUT;
1401
1402}
1403
1404# restore_vocabulary_data
1405#
1406# Read phrases.3 and restore vocabulary information. Then write
1407# this data to the MGPP input files (pword.txt and pdata.txt) and
1408# (if requested) to the saved phrases file.
1409
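# Each phrases.3 line produces one pdata.txt entry of the form
#
#   <Document>phind-id:phrase-text:frequency:expansion-count:document-count:expansions:documents:
#
# (thesaurus phrases carry two extra fields, the link count and relations),
# and one pword.txt entry of the form "<Document>word", where word is the
# phrase text for single-word phrases and empty otherwise.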
1410sub restore_vocabulary_data {
1411 my ($self) = @_;
1412
1413 my $out = $self->{'outhandle'};
1414 my $verbosity = $self->{'verbosity'};
1415 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1416
1417 my $phinddir = $self->{'phinddir'};
1418 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1419 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1420 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1421 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1422
1423 my $savephrases = $self->{'savephrases'};
1424
1425 # 1.
1426 # Read the vocabulary file
1427 open(V, "<$vocabfile")
1428 || die "Cannot open $vocabfile: $!";
1429 my @symbol;
1430 my $i = 1;
1431 while(<V>) {
1432 chomp;
1433 $symbol[$i++] = $_;
1434 }
1435 close V;
1436
1437 # 2.
1438 # Translate phrases.3 to MGPP input files
1439 my ($key, $text, $word, $isThesaurus, $line);
1440 my @fields;
1441 my $linenumber = 0;
1442
1443 open(IN, "<$infile");
1444 open(DATA, ">$datafile");
1445 open(WORD, ">$wordfile");
1446
1447 # Save the phrases in a separate text file
1448 if ($savephrases) {
1449 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1450 open(SAVE, ">$savephrases");
1451 }
1452
1453 while(<IN>) {
1454
1455 # read the line
1456 chomp;
1457 $line = $_;
1458 @fields = split(/:/, $line);
1459
1460 # get a phrase number for this line
1461 $key = shift @fields;
1462
1463 # restore the text of the phrase
1464 $text = shift @fields;
1465 $text =~ s/s(\d+)/$symbol[$1]/g;
1466	if ($text =~ / / || $text eq 'untranslated') {
1467	    $word = "";
1468	} else {
1469	    $word = $text;
1470	}
1471
1472 # output the phrase data
1473 print DATA "<Document>";
1474 print DATA "$key:$text:", join(":", @fields), ":\n";
1475
1476 # output the word index search data
1477 print WORD "<Document>$word\n";
1478
1479 # output the phrases to a text file
1480 if ($savephrases) {
1481 if ((scalar @fields) == 7) {
1482 $isThesaurus = 1;
1483 } else {
1484 $isThesaurus = 0;
1485 }
1486 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1487 }
1488 }
1489 close IN;
1490 close WORD;
1491 close DATA;
1492 close SAVE if ($savephrases);
1493
1494}
1495
1496
1497
1498# sort routines used to renumber phrases
1499
1500sub numerically { $a <=> $b }
1501
1502sub by_doc_frequency {
1503 my $fa = 1;
1504 if ($a =~ /,/) {
1505 $fa = $a;
1506 $fa =~ s/\d+,//;
1507 }
1508 my $fb = 1;
1509 if ($b =~ /,/) {
1510 $fb = $b;
1511 $fb =~ s/\d+,//;
1512 }
1513
1514 return ($fb <=> $fa);
1515}
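# Document entries look like "2240,2" (document 2240, frequency 2) or just
# "2253" (an implicit frequency of 1), so sorting with by_doc_frequency puts
# the documents in which a phrase occurs most often first: for example,
# "2240,2" sorts ahead of the frequency-1 entries "2253" and "2254".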
1516
15171;