Context Navigation

source: trunk/gsdl/perllib/classify/Phind.pm@ 6408

Last change on this file since 6408 was 6408, checked in by jmt12, 20 years ago
Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the catagory regexp for any regular expression (which can then be hidden under lower detail modes in GLI)
Property svn:keywords set to `Author Date Id Revision`
File size: 41.9 KB

Line
1	###########################################################################
2	#
3	# Phind.pm -- the Phind classifier
4	#
5	# Copyright (C) 2000 Gordon W. Paynter
6	# Copyright (C) 2000 New Zealand Digital Library Project
7	#
8	#
9	# A component of the Greenstone digital library software
10	# from the New Zealand Digital Library Project at the
11	# University of Waikato, New Zealand.
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# The Phind clasifier plugin.
30	# Type "classinfo.pl Phind" at the command line for a summary.
31
32	# 12/05/02 Added usage datastructure - John Thompson
33
34	package Phind;
35
36	use BasClas;
37	use util;
38	use ghtml;
39	use unicode;
40
41	my @removedirs = ();
42
43	my %wanted_index_files = ('td'=>1,
44	't'=>1,
45	'ti'=>1,
46	'tl'=>1,
47	'tsd'=>1,
48	'idb'=>1,
49	'ib1'=>1,
50	'ib2'=>1,
51	'ib3'=>1,
52	'i'=>1,
53	'il'=>1,
54	'w'=>1,
55	'wa'=>1);
56
57	sub BEGIN {
58	@ISA = ('BasClas');
59	}
60
61	sub END {
62
63	# Tidy up stray files - we do this here as there's some weird problem
64	# preventing us from doing it in the get_classify_info() function (on
65	# windows at least) where the close() appears to fail on txthandle and
66	# dochandle, thus preventing us from deleting those files
67
68	foreach my $dir (@removedirs) {
69	if (-d $dir && opendir (DIR, $dir)) {
70	my @files = readdir DIR;
71	closedir DIR;
72
73	foreach $file (@files) {
74	next if $file =~ /^\.\.?$/;
75	my ($suffix) = $file =~ /\.([^\.]+)$/;
76	if (!defined $suffix \|\| !defined $wanted_index_files{$suffix}) {
77	# delete it!
78	&util::rm (&util::filename_cat ($dir, $file));
79	}
80	}
81	}
82	}
83	}
84
85	my $arguments =
86	[ { 'name' => "text",
87	'desc' => "{Phind.text}",
88	'type' => "string",
89	'deft' => "section:Title,section:text",
90	'reqd' => "no" },
91	{ 'name' => "title",
92	'desc' => "{Phind.title}",
93	'type' => "metadata",
94	'deft' => "Title",
95	'reqd' => "no" },
96	{ 'name' => "button",
97	'desc' => "{Phind.button}",
98	'type' => "string",
99	'deft' => "Phrase",
100	'reqd' => "no" },
101	{ 'name' => "language",
102	'desc' => "{Phind.language}",
103	'type' => "language",
104	'deft' => "en",
105	'reqd' => "no" },
106	{ 'name' => "savephrases",
107	'desc' => "{Phind.savephrases}",
108	'type' => "string",
109	'deft' => "",
110	'reqd' => "no" },
111	{ 'name' => "builddir",
112	'desc' => "{BasClas.builddir}",
113	'type' => "string",
114	'deft' => "" },
115	{ 'name' => "suffixmode",
116	'desc' => "{Phind.suffixmode}",
117	'type' => "int",
118	'deft' => "1",
119	'reqd' => "no" },
120	{ 'name' => "thesaurus",
121	'desc' => "{Phind.thesaurus}",
122	'type' => "string",
123	'deft' => "",
124	'reqd' => "no" },
125	{ 'name' => "untidy",
126	'desc' => "{Phind.untidy}",
127	'type' => "flag",
128	'reqd' => "no" } ];
129
130	my $options = { 'name' => "Phind",
131	'desc' => "{Phind.desc}",
132	'abstract' => "no",
133	'inherits' => "yes",
134	'args' => $arguments };
135
136	# sub print_usage {
137	# print STDERR "
138	# usage: classify Phind [options]
139
140	# options:
141	# -text Fields The text used to build the phrase hierarchy.
142	# (default: 'section:Title,section:text')
143
144	# -title Title The metadata field used to describe each document.
145	# (default: 'Title')
146
147	# -button Name The label for the classifier screen and button in
148	# navigation bar.
149	# (default: 'Phrase')
150
151	# -language Regex Language or languages to use building hierarchy.
152	# Languages are identified by two-letter country codes
153	# like en (English), es (Spanish), and fr (French).
154	# Language is a regular expression, so 'en\|fr' (English or
155	# French) and '..' (match any language) are valid.
156	# (default: 'en'.)
157
158	# -savephrases File If set, the phrase infomation will be stored in
159	# the given file as text. It is probably a good idea
160	# to use an absolute path.
161	# (default: not set)
162
163	# -suffixmode N The smode parameter to the phrase extraction program. A
164	# value of 0 means that stopwords are ignored, and of 1
165	# means that stopwords are used.
166	# (default: 1)
167
168	# -thesaurus Name Name of a thesaurus stored in Phind format in the
169	# collection's etc directory.
170	# (default: not set)
171
172	# -untidy Don't remove working files.
173
174	# "; }
175
176	# Phrase delimiter symbols - these should be abstracted out someplace
177
178	my $colstart = "COLLECTIONSTART";
179	my $colend = "COLLECTIONEND";
180	my $doclimit = "DOCUMENTLIMIT";
181	my $senlimit = "SENTENCELIMIT";
182	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
183
184
185	# Create a new Phind browser based on collect.cfg
186
187	sub new {
188	my $class = shift (@_);
189	my $self = new BasClas($class, @_);
190
191	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
192	my $option_list = $self->{'option_list'};
193	push( @{$option_list}, $options );
194
195	my $out = $self->{'outhandle'};
196
197	# Ensure the Phind generate scripts are in place
198	my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
199	$file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
200	my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
201
202	if (!(-e $file1)) {
203	print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
204	exit(1);
205	}
206
207	# Parse classifier arguments
208	my $builddir = "";
209	if (!parsargv::parse(\@_,
210	q^text/.*/section:Title,section:text^, \$self->{'indexes'},
211	q^title/.*/Title^, \$self->{'titlefield'},
212	q^button/.*/Phrase^, \$self->{'buttonname'},
213	q^language/.*/en^, \$language,
214	q^builddir/.*/^, \$builddir,
215	q^savephrases/.*/^, \$self->{'savephrases'},
216	q^suffixmode/\d/1^, \$self->{'suffixmode'},
217	q^thesaurus/.*/^, \$self->{'thesaurus'},
218	q^untidy^, \$self->{'untidy'},
219	"allow_extra_options")) {
220
221	print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
222	$self->print_txt_usage(""); # Use default resource bundle
223	die "\n";
224	}
225
226	# classifier information
227	$self->{'collection'} = $ENV{'GSDLCOLLECTION'};
228
229	# limit languages
230	$self->{'language_exp'} = $language;
231
232	# collection directories
233	$self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
234	if (!$builddir) {
235	$builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
236	}
237	$self->{'builddir'} = $builddir;
238
239	$self->{'total'} = 0;
240
241	return bless $self, $class;
242	}
243
244
245	# Initialise the Phind classifier
246
247	sub init {
248	my $self = shift (@_);
249
250	# ensure we have a build directory
251	my $builddir = $self->{'builddir'};
252	die unless (-e "$builddir");
253
254	# create Phind directory
255	my $phnumber = 1;
256	my $phinddir = &util::filename_cat($builddir, "phind1");
257	while (-e "$phinddir") {
258	$phnumber++;
259	$phinddir = &util::filename_cat($builddir, "phind$phnumber");
260	}
261	&util::mk_dir("$phinddir");
262	$self->{'phinddir'} = $phinddir;
263	$self->{'phindnumber'} = $phnumber;
264
265	push(@removedirs, $phinddir) unless $self->{'untidy'};
266
267	# open filehandles for documents and text
268	my $clausefile = &util::filename_cat("$phinddir", "clauses");
269	&util::rm($clausefile) if (-e $clausefile);
270
271	my $txthandle = 'TEXT' . $phnumber;
272	open($txthandle, ">$clausefile") \|\| die "Cannot open $clausefile: $!";
273	$self->{'txthandle'} = $txthandle;
274
275	my $docfile = &util::filename_cat("$phinddir", "docs.txt");
276	&util::rm($docfile) if (-e $docfile);
277
278	my $dochandle = 'DOC' . $phnumber;
279	open($dochandle, ">$docfile") \|\| die "Cannot open $docfile: $!";
280	$self->{'dochandle'} = $dochandle;
281
282	}
283
284
285	# Classify each document.
286	#
287	# Each document is passed here in turn. The classifier extracts the
288	# text of each and stores it in the clauses file. Document details are
289	# stored in the docs.txt file.
290
291	sub classify {
292	my $self = shift (@_);
293	my $doc_obj = shift @_;
294
295	my $verbosity = $self->{'verbosity'};
296	my $top_section = $doc_obj->get_top_section();
297
298	my $titlefield = $self->{'titlefield'};
299
300	my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
301	if (!defined($title)) {
302	$title = "";
303	print STDERR "Phind: document has no title\n";
304	}
305	print "process: $title\n" if ($verbosity > 2);
306
307	# Only consider the file if it is in the correct language
308	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
309	my $phrlanguage = $self->{'language_exp'};
310	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
311
312	# record this file
313	$self->{'total'} ++;
314	print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
315
316
317	# Store document details
318	my $OID = $doc_obj->get_OID();
319	$OID = "NULL" unless defined $OID;
320	my $dochandle = $self->{'dochandle'};
321	print $dochandle "<Document>\t$OID\t$title\n";
322
323	# Store the text occuring in this object
324
325	# output the document delimiter
326	my $txthandle = $self->{'txthandle'};
327	print $txthandle "$doclimit\n";
328
329	# iterarate over the required indexes and store their text
330	my $indexes = $self->{'indexes'};
331	my $text = "";
332	my ($part, $level, $field, $section, $data, $dataref);
333
334	foreach $part (split(/,/, $indexes)) {
335
336	# Each field has a level and a data element ((e.g. document:Title)
337	($level, $field) = split(/:/, $part);
338	die unless ($level && $field);
339
340	# Extract the text from every section
341	# (In phind, document:text and section:text are equivalent)
342	if ($field eq "text") {
343	$data = "";
344	$section = $doc_obj->get_top_section();
345	while (defined($section)) {
346	$data .= $doc_obj->get_text($section) . "\n";
347	$section = $doc_obj->get_next_section($section);
348	}
349	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
350	}
351
352	# Extract a metadata field from a document
353	# (If there is more than one element of the given type, get them all.)
354	elsif ($level eq "document") {
355	$dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
356	foreach $data (@$dataref) {
357	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
358	}
359	}
360
361	# Extract metadata from every section in a document
362	elsif ($level eq "section") {
363	$data = "";
364	$section = $doc_obj->get_top_section();
365	while (defined($section)) {
366	$dataref = $doc_obj->get_metadata($section, $field);
367	$data .= join("\n", @$dataref) . "\n";
368	$section = $doc_obj->get_next_section($section);
369	}
370	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
371	}
372
373	# Some sort of specification which I don't understand
374	else {
375	die "Unknown level ($level) in Phind index ($part)\n";
376	}
377
378	}
379
380	# output the text
381	$text =~ tr/\n//s;
382	print $txthandle "$text";
383	}
384
385
386	# Construct the classifier from the information already gathered
387	#
388	# When get_classify_info is called, the clauses and docs.txt files have
389	# already been constructed in the Phind directory. This function will
390	# translate them into compressed, indexed MGPP files that can be read by
391	# the phindcgi script. It will also register our classifier so that it
392	# shows up in the navigation bar.
393
394	sub get_classify_info {
395	my $self = shift (@_);
396	my ($gli) = @_;
397
398	close $self->{'dochandle'};
399	close $self->{'txthandle'};
400	my $verbosity = $self->{'verbosity'};
401	my $out = $self->{'outhandle'};
402	my $phinddir = $self->{'phinddir'};
403
404	my $osextra = "";
405	if ($ENV{'GSDLOS'} !~ /^windows$/i) {
406	$osextra = " -d /";
407	}
408
409	print STDERR "</Stage>\n" if $gli;
410
411	if ($verbosity) {
412	print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
413	print $out "*** in ", $self->{'phinddir'}, "\n";
414	}
415
416	print STDERR "<Stage name='Phind'>\n" if $gli;
417
418	# Construct phind indexes
419	my $suffixmode = $self->{'suffixmode'};
420	my ($command, $status);
421
422	# Generate the vocabulary, symbol statistics, and numbers file
423	# from the clauses file
424	print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
425	print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
426	&extract_vocabulary($self);
427
428	# Use the suffix program to generate the phind/phrases file
429	print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
430	print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
431	&execute("suffix \"$phinddir\" $suffixmode $verbosity", $verbosity, $out);
432
433	# check that we generated some files. It's not necessarily an error if
434	# we didn't (execute() would have quit on error), but we can't go on.
435	my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
436	if (! -r $phrasesfile) {
437	print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
438	print $out "\nNo phrases found for Phind classifier!\n";
439	return;
440	}
441
442	# Create the phrase file and put phrase numbers in phind/phrases
443	print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
444	print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
445	&renumber_phrases($self);
446
447	print $out "\nCreating phrase databases\n";
448	print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
449	my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
450	my $mg_stem = &util::filename_cat($phinddir, "pdata");
451
452	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
453	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
454	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
455
456	# create the mg index of words
457	print $out "\nCreating word-level search indexes\n";
458	print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
459	$mg_input = &util::filename_cat($phinddir, "pword.txt");
460	$mg_stem = &util::filename_cat($phinddir, "pword");
461
462	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
463	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
464	&execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
465	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
466	&execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
467	&execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
468
469	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
470	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
471	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
472
473	# create the mg document information database
474	print $out "\nCreating document information databases\n";
475	print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
476	$mg_input = &util::filename_cat($phinddir, "docs.txt");
477	$mg_stem = &util::filename_cat($phinddir, "docs");
478
479	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
480	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
481	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
482
483	# Return the information about the classifier that we'll later want to
484	# use to create macros when the Phind classifier document is displayed.
485	my %classifyinfo = ('thistype'=>'Invisible',
486	'childtype'=>'Phind',
487	'Title'=>$self->{'buttonname'},
488	'parameters'=>"phindnumber=$self->{'phindnumber'}",
489	'contains'=>[]);
490
491	my $collection = $self->{'collection'};
492	my $url = "library?a=p&p=phind&c=$collection";
493	push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
494
495	return \%classifyinfo;
496	}
497
498
499
500	sub convert_gml_to_tokens {
501
502	my ($language_exp, $text) = @_;
503
504	# escape any magic words... - jrm21
505	foreach my $delim (@delimiters) {
506	my $replacement=lc($delim);
507	my $num= $text=~ s/$delim/$replacement/g;
508	if (!$num) {$num=0;}
509	}
510
511	if ($language_exp =~ /en/) {
512	return &convert_gml_to_tokens_EN($text);
513	}
514
515	if ($language_exp =~ /zh/) {
516	return &convert_gml_to_tokens_ZH($text);
517	}
518
519	$_ = $text;
520
521	# 1. remove GML tags
522
523	# Remove everything that is in a tag
524	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
525	s/\s<br>\s/ LINEBREAK /isgo;
526	s/<[^>]*>/ /sgo;
527
528	# Now we have the text, but it may contain HTML
529	# elements coded as > etc. Remove these tags.
530	s/&/&/sgo;
531	s/</</sgo;
532	s/>/>/sgo;
533	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
534	s/\s<br>\s/ LINEBREAK /isgo;
535	s/<[^>]*>/ /sgo;
536
537	# replace<p> and <br> placeholders with clause break symbol (\n)
538	s/\s+/ /gso;
539	s/PARAGRAPHBREAK/\n/sgo;
540	s/LINEBREAK/\n/sgo;
541
542
543
544
545	# 2. Split the remaining text into space-delimited tokens
546
547	# Convert any HTML special characters (like ") to their UTF8 equivalent
548	s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
549
550	# Split text at word boundaries
551	s/\b/ /go;
552
553	# 3. Convert the remaining text to "clause format"
554
555	# Insert newline if the end of a sentence is detected
556	# (delimter is: "[\.\?\!]\s")
557	# s/\s*[\.\?\!]\s+/\n/go;
558
559	# remove unnecessary punctuation and replace with clause break symbol (\n)
560	s/[^\w ]/\n/go;
561
562	# remove extraneous whitespace
563	s/ +/ /sgo;
564	s/^\s+//mgo;
565	s/\s*$/\n/mgo;
566
567	# remove lines that contain one word or less
568	s/^\S*$//mgo;
569	s/^\s*$//mgo;
570	tr/\n//s;
571
572	return $_;
573	}
574
575	# a chinese version
576	sub convert_gml_to_tokens_ZH {
577
578	$_ = shift @_;
579
580	# Replace all whitespace with a simple space
581	s/\s+/ /gs;
582	# Remove everything that is in a tag
583	s/\s<p>\s/ PARAGRAPHBREAK /isg;
584	s/\s<br>\s/ LINEBREAK /isg;
585	s/<[^>]*>/ /sg;
586
587	# Now we have the text, but it may contain HTML
588	# elements coded as > etc. Remove these tags.
589	s/</</sg;
590	s/>/>/sg;
591
592	s/\s+/ /sg;
593	s/\s<p>\s/ PARAGRAPHBREAK /isg;
594	s/\s<br>\s/ LINEBREAK /isg;
595	s/<[^>]*>/ /sg;
596
597	# remove & and other miscellaneous markup tags
598	s/&/&/sg;
599	s/</</sg;
600	s/>/>/sg;
601	s/&/&/sg;
602
603	# replace<p> and <br> placeholders with carriage returns
604	s/PARAGRAPHBREAK/\n/sg;
605	s/LINEBREAK/\n/sg;
606
607
608	# print STDERR "text:$_\n";
609	return $_;
610	}
611
612	# A version of convert_gml_to_tokens that is fine-tuned to the English language.
613
614	sub convert_gml_to_tokens_EN {
615	$_ = shift @_;
616
617	# FIRST, remove GML tags
618
619	# Replace all whitespace with a simple space
620	s/\s+/ /gs;
621
622	# Remove everything that is in a tag
623	s/\s<p>\s/ PARAGRAPHBREAK /isg;
624	s/\s<br>\s/ LINEBREAK /isg;
625	s/<[^>]*>/ /sg;
626
627	# Now we have the text, but it may contain HTML
628	# elements coded as > etc. Remove these tags.
629	s/</</sg;
630	s/>/>/sg;
631
632	s/\s+/ /sg;
633	s/\s<p>\s/ PARAGRAPHBREAK /isg;
634	s/\s<br>\s/ LINEBREAK /isg;
635	s/<[^>]*>/ /sg;
636
637	# remove & and other miscellaneous markup tags
638	s/&/&/sg;
639	s/</</sg;
640	s/>/>/sg;
641	s/&/&/sg;
642
643	# replace<p> and <br> placeholders with carriage returns
644	s/PARAGRAPHBREAK/\n/sg;
645	s/LINEBREAK/\n/sg;
646
647
648	# Exceptional punctuation
649	#
650	# We make special cases of some punctuation
651
652	# remove any apostrophe that indicates omitted letters
653	s/(\w+)\'(\w*\s)/ $1$2 /g;
654
655	# remove period that appears in a person's initals
656	s/\s([A-Z])\./ $1 /g;
657
658	# replace hyphens in hypheanted words and names with a space
659	s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
660
661	# Convert the remaining text to "clause format",
662	# This means removing all excess punctuation and garbage text,
663	# normalising valid punctuation to fullstops and commas,
664	# then putting one cluse on each line.
665
666	# Insert newline when the end of a sentence is detected
667	# (delimter is: "[\.\?\!]\s")
668	s/\s*[\.\?\!]\s+/\n/g;
669
670	# split numbers after four digits
671	s/(\d\d\d\d)/$1 /g;
672
673	# split words after 32 characters
674
675	# squash repeated punctuation
676	tr/A-Za-z0-9 //cs;
677
678	# save email addresses
679	# s/\w+@\w+\.[\w\.]+/EMAIL/g;
680
681	# normalise clause breaks (mostly punctuation symbols) to commas
682	s/[^A-Za-z0-9 \n]+/ , /g;
683
684	# Remove repeated commas, and replace with newline
685	s/\s*,[, ]+/\n/g;
686
687	# remove extra whitespace
688	s/ +/ /sg;
689	s/^\s+//mg;
690	s/\s*$/\n/mg;
691
692	# remove lines that contain one word or less
693	s/^\w*$//mg;
694	s/^\s*$//mg;
695	tr/\n//s;
696
697	return $_;
698
699	}
700
701
702
703	# Execute a system command
704
705	sub execute {
706	my ($command, $verbosity, $outhandle) = @_;
707	print $outhandle "Executing: $command\n" if ($verbosity > 2);
708	$! = 0;
709	my $status = system($command);
710	if ($status != 0) {
711	print STDERR "Phind - Error executing '$command': $!\n";
712	exit($status); # this causes the build to fail...
713	}
714	}
715
716
717	# Generate the vocabulary, symbol statistics, and numbers file from the
718	# clauses file. This is legacy code, so is a bit messy and probably wont
719	# run under windows.
720
721	sub extract_vocabulary {
722	my ($self) = @_;
723
724	my $verbosity = $self->{'verbosity'};
725	my $out = $self->{'outhandle'};
726
727	my $collectiondir = $self->{'collectiondir'};
728	my $phinddir = $self->{'phinddir'};
729
730	my $language_exp = $self->{'language_exp'};
731
732	my ($w, $l, $line, $word);
733
734	my ($first_delimiter, $last_delimiter,
735	$first_stopword, $last_stopword,
736	$first_extractword, $last_extractword,
737	$first_contentword, $last_contentword,
738	$phrasedelimiter);
739
740	my $thesaurus = $self->{'thesaurus'};
741	my ($thesaurus_links, $thesaurus_terms,
742	%thesaurus, $first_thesaurusword, $last_thesaurusword);
743
744	my %symbol;
745	my (%freq);
746
747	print $out "Calculating vocabulary\n" if ($verbosity > 1);
748
749	# Read and store the stopwords
750	my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
751	my $stopword_files = ();
752	my ($language, $language_dir, $file, $file_name);
753	my %stopwords;
754
755	# Examine each directory in the stopword directory
756	opendir(STOPDIR, $stopdir);
757	foreach $language (readdir STOPDIR) {
758
759	# Ignore entries that do not match the classifier's language
760	next unless ($language =~ /$language_exp/);
761	$language_dir = &util::filename_cat($stopdir, $language);
762	next unless (-d "$language_dir");
763
764	opendir(LANGDIR, $language_dir);
765	foreach $file (readdir LANGDIR) {
766
767	# Ignore entries that are not stopword files
768	next unless ($file =~ /sw$/);
769	$file_name = &util::filename_cat($language_dir, $file);
770	next unless (-f "$file_name");
771
772	# Read the stopwords
773	open(STOPFILE, "<$file_name");
774	while (<STOPFILE>) {
775	s/^\s+//;
776	s/\s.*//;
777	$word = $_;
778	$l = lc($word);
779	$stopwords{$l} = $word;
780	}
781	close STOPFILE;
782
783	}
784	closedir LANGDIR;
785	}
786	closedir STOPDIR;
787
788	# Read thesaurus information
789	if ($thesaurus) {
790
791	# link file exists
792	$thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
793	die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
794
795	# ensure term file exists in the correct language
796	if ($language_exp =~ /^([a-z][a-z])/) {
797	$language = $1;
798	} else {
799	$language = 'en';
800	}
801	$thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
802	die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
803
804
805	# Read the thesaurus terms
806	open(TH, "<$thesaurus_terms");
807	while(<TH>) {
808	s/^\d+ //;
809	s/\(.*\)//;
810	foreach $w (split(/\s+/, $_)) {
811	$thesaurus{lc($w)} = $w;
812	}
813	}
814	close TH;
815	}
816
817	# Read words in the text and count occurences
818	open(TXT, "<$phinddir/clauses");
819
820	my @words;
821	while(<TXT>) {
822	$line = $_;
823	next unless ($line =~ /./);
824
825	@words = split(/\s+/, $line);
826	foreach $w (@words) {
827	$l = lc($w);
828	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
829	$freq{$w}++;
830	}
831	$freq{$senlimit}++;
832	}
833
834	close TXT;
835
836	# Calculate the "best" form of each word
837	my (%bestform, %totalfreq, %bestfreq);
838
839	foreach $w (sort (keys %freq)) {
840	$l = lc($w);
841
842	# totalfreq is the number of times a term appears in any form
843	$totalfreq{$l} += $freq{$w};
844
845	if (defined $stopwords{$l}) {
846	$bestform{$l} = $stopwords{$l};
847
848	} elsif (defined $thesaurus{$l}) {
849	$bestform{$l} = $thesaurus{$l};
850
851	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
852	$bestfreq{$l} = $freq{$w};
853	$bestform{$l} = $w;
854	}
855	}
856	undef %freq;
857	undef %bestfreq;
858
859
860	# Assign symbol numbers to tokens
861	my $nextsymbol = 1;
862	my (@vocab);
863
864	# Delimiters
865	$first_delimiter = 1;
866
867	foreach $word (@delimiters) {
868
869	# $word = lc($word); # jrm21
870	$word = uc($word);
871	$bestform{$word} = $word;
872	$vocab[$nextsymbol] = $word;
873	$symbol{$word} = $nextsymbol;
874	$nextsymbol++;
875	}
876	$last_delimiter = $nextsymbol - 1;
877	# Stopwords
878	$first_stopword = $nextsymbol;
879
880	foreach my $word (sort keys %stopwords) {
881	# don't include stopword unless it occurs in the text
882	$word = lc($word);
883	next unless ($totalfreq{$word});
884	next if ($symbol{$word});
885
886	$vocab[$nextsymbol] = $word;
887	$symbol{$word} = $nextsymbol;
888	$nextsymbol++;
889	}
890	$last_stopword = $nextsymbol - 1;
891	$first_contentword = $nextsymbol;
892
893	# Thesaurus terms
894	if ($thesaurus) {
895	$first_thesaurusword = $nextsymbol;
896
897	foreach my $word (sort keys %thesaurus) {
898
899	$word = lc($word);
900	next if ($symbol{$word});
901	$bestform{$word} = $thesaurus{$word};
902
903	$vocab[$nextsymbol] = $word;
904	$symbol{$word} = $nextsymbol;
905	$nextsymbol++;
906
907	}
908	$last_thesaurusword = $nextsymbol - 1;
909	}
910
911	# Other content words
912	$first_extractword = $nextsymbol;
913
914	foreach my $word (sort (keys %bestform)) {
915
916	next if ($symbol{$word});
917
918	$vocab[$nextsymbol] = $word;
919	$symbol{$word} = $nextsymbol;
920	$nextsymbol++;
921	}
922	$last_extractword = $nextsymbol - 1;
923	$last_contentword = $nextsymbol - 1;
924
925	# Outut the words
926	print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
927	open(VOC, ">$phinddir/clauses.vocab");
928
929	for (my $i = 1; $i < $nextsymbol; $i++) {
930	$w = $vocab[$i];
931
932	print VOC "$bestform{$w}\n";
933	$totalfreq{$w} = 0 unless ($totalfreq{$w});
934	}
935	close VOC;
936
937
938	# Create statistics file
939	# Output statistics about the vocablary
940	print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
941	&util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
942
943	open(STAT, ">$phinddir/clauses.stats")
944	\|\| die "Cannot open $phinddir/clauses.stats: $!";
945
946	print STAT "first_delimiter $first_delimiter\n";
947	print STAT "last_delimiter $last_delimiter\n";
948	print STAT "first_stopword $first_stopword\n";
949	print STAT "last_stopword $last_stopword\n";
950	if ($thesaurus) {
951	print STAT "first_thesaurusword $first_thesaurusword\n";
952	print STAT "last_thesaurusword $last_thesaurusword\n";
953	}
954	print STAT "first_extractword $first_extractword\n";
955	print STAT "last_extractword $last_extractword\n";
956	print STAT "first_contentword $first_contentword\n";
957	print STAT "last_contentword $last_contentword\n";
958	print STAT "first_symbol $first_delimiter\n";
959	print STAT "last_symbol $last_contentword\n";
960	print STAT "first_word $first_stopword\n";
961	print STAT "last_word $last_contentword\n";
962	close STAT;
963
964	undef @vocab;
965
966
967	# Create numbers file
968	# Save text as symbol numbers
969	print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
970
971	open(TXT, "<$phinddir/clauses");
972	open(NUM, ">$phinddir/clauses.numbers");
973
974	## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
975	## print NUM "$symbol{lc($colstart)}\n"; # jrm21
976	$phrasedelimiter = $symbol{$senlimit};
977	print NUM "$symbol{$colstart}\n";
978
979	# set up the special symbols that delimit documents and sentences
980	while(<TXT>) {
981
982	# split sentence into a list of tokens
983	$line = $_;
984	next unless ($line =~ /./);
985	@words = split(/\s+/, $line);
986
987	# output one token at a time
988	foreach $word (@words) {
989	# don't lower-case special delimiters - jrm21
990	if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
991	$word = lc($word);
992	}
993	print NUM "$symbol{$word}\n";
994	}
995
996	# output phrase delimiter
997	print NUM "$phrasedelimiter\n";
998	}
999
1000	close TXT;
1001	# print NUM "$symbol{lc($colend)}\n";# jrm21
1002	print NUM "$symbol{$colend}\n";
1003	close NUM;
1004
1005	# Save thesaurus data in one convienient file
1006	if ($thesaurus) {
1007
1008	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1009
1010
1011	print $out "Saving thesaurus as numbers in $thesaurusfile\n"
1012	if ($verbosity > 1);
1013
1014	# Read the thesaurus terms
1015	my ($num, $text, %thes_symbols);
1016
1017	open(TH, "<$thesaurus_terms");
1018	while(<TH>) {
1019	chomp;
1020	@words = split(/\s+/, $_);
1021	$num = shift @words;
1022	$text = "";
1023
1024	# translate words into symbol numbers
1025	foreach $word (@words) {
1026	$word = lc($word);
1027	if ($symbol{$word}) {
1028	$text .= "s$symbol{$word} ";
1029	} elsif ($verbosity) {
1030	print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
1031	}
1032	}
1033	$text =~ s/ $//;
1034	$thes_symbols{$num} = $text;
1035	}
1036	close TH;
1037
1038	# Read the thesaurus links and write the corresponding data
1039	open(TH, "<$thesaurus_links");
1040	open(THOUT, ">$thesaurusfile");
1041
1042	while(<TH>) {
1043	chomp;
1044	($num, $text) = split(/:/, $_);
1045
1046	if (defined($thes_symbols{$num})) {
1047	print THOUT "$num:$thes_symbols{$num}:$text\n";
1048	} else {
1049	print THOUT "$num:untranslated:$text\n";
1050	}
1051	}
1052	close TH;
1053	close THOUT;
1054	}
1055
1056
1057
1058
1059	}
1060
1061
1062	# renumber_phrases
1063	#
1064	# Prepare the phrases file to be input to mgpp. The biggest problem is
1065	# reconciling the phrase identifiers used by the suffix program (which
1066	# we'll call suffix-id numbers) with the numbers used in the thesaurus
1067	# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
1068	# Phind-id numbers must be sorted by frequency of occurance.
1069	#
1070	# Start creating a set of phind-id numbers from the sorted suffix-id
1071	# numbers and (if required) the thesaurus-id numbers. Then add any other
1072	# phrases occuring in the thesaurus.
1073	#
1074	# The last thing we have to do is restore the vocabulary information to the
1075	# phrase file so that the phrases are stored as words, not as symbol
1076	# numbers.
1077
1078	# The original phrases file looks something like this:
1079	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1080	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1081	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1082	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1083
1084
1085	sub renumber_phrases {
1086	my ($self) = @_;
1087
1088	renumber_suffix_data($self);
1089	renumber_thesaurus_data($self);
1090	restore_vocabulary_data($self);
1091
1092	}
1093
1094
1095
1096	# renumber_suffix_data
1097	#
1098	# Translate phrases file to phrases.2 using phind keys instead
1099	# of suffix keys and sorting the expansion data.
1100
1101	sub renumber_suffix_data {
1102	my ($self) = @_;
1103
1104	my $verbosity = $self->{'verbosity'};
1105	my $out = $self->{'outhandle'};
1106	print $out "Translate phrases: suffix-ids become phind-id's\n"
1107	if ($verbosity);
1108
1109	my $phinddir = $self->{'phinddir'};
1110	my $infile = &util::filename_cat($phinddir, 'phrases');
1111	my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1112
1113	# Read the phrase file. Calculate initial set of phind-id
1114	# numbers and store (suffixid -> frequency) relation.
1115
1116	my %suffixtophind;
1117	my @phindfrequency;
1118	my (@fields, $suffixid);
1119	my $nextphind = 1;
1120
1121	open(IN, "<$infile");
1122	while(<IN>) {
1123
1124	chomp;
1125	@fields = split(/:/, $_);
1126
1127	# get next suffixid and phindid
1128	$suffixid = shift @fields;
1129	$suffixtophind{$suffixid} = $nextphind;
1130
1131	# store total frequency
1132	shift @fields;
1133	$totalfrequency[$nextphind] = shift @fields;
1134
1135	$nextphind++;
1136	}
1137	close IN;
1138
1139
1140	# Translate phrases file to phrases.2. Use phind keys (not suffix
1141	# keys), sort expansion and document occurance data in order of
1142	# descending frequency..
1143	open(IN, "<$infile");
1144	open(OUT, ">$outfile");
1145
1146	my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1147	my (@documwents, @newexp, $k, $n);
1148	my $linenumber = 0;
1149
1150	while(<IN>) {
1151
1152	# read the line
1153	chomp;
1154	@fields = split(/:/, $_);
1155
1156	# get a phrase number for this line
1157	$suffixid = shift @fields;
1158	die unless (defined($suffixtophind{$suffixid}));
1159	$phindid = $suffixtophind{$suffixid};
1160
1161	# get the symbols in the phrase
1162	$text = shift @fields;
1163
1164	# output status information
1165	$linenumber++;
1166	if ($verbosity > 2) {
1167	if ($linenumber % 1000 == 0) {
1168	print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1169	}
1170	print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1171	}
1172
1173	# get the phrase frequency
1174	$tf = shift @fields;
1175
1176	# get the number of expansions
1177	$countexp = shift @fields;
1178
1179	# get the expansions, convert them into phind-id numbers, and sort them
1180	$expansions = shift @fields;
1181	@newexp = ();
1182	foreach $k (split(/,/, $expansions)) {
1183	die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1184	$n = $suffixtophind{$k};
1185	push @newexp, $n;
1186	}
1187	@newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1188
1189	# get the number of documents
1190	$countdocs = shift @fields;
1191
1192	# get the documents and sort them
1193	$documents = shift @fields;
1194	$documents =~ s/d//g;
1195	@documents = split(/;/, $documents);
1196	@documents = sort by_doc_frequency @documents;
1197
1198	# output the phrase data
1199	print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1200	print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1201
1202	}
1203
1204	close IN;
1205	close OUT;
1206	}
1207
1208
1209	# renumber_thesaurus_data
1210	#
1211	# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1212
1213	sub renumber_thesaurus_data {
1214	my ($self) = @_;
1215
1216	my $out = $self->{'outhandle'};
1217	my $verbosity = $self->{'verbosity'};
1218	my $thesaurus = $self->{'thesaurus'};
1219
1220	my $phinddir = $self->{'phinddir'};
1221	my $infile = &util::filename_cat($phinddir, "phrases.2");
1222	my $outfile = &util::filename_cat($phinddir, "phrases.3");
1223
1224
1225	# If no thesaurus is defined, simply move the phrases file.
1226	if (!$thesaurus) {
1227	print $out "Translate phrases.2: no thesaurus data\n"
1228	if ($verbosity);
1229	&util::mv($infile, $outfile);
1230	return;
1231	}
1232
1233	print $out "Translate phrases.2: add thesaurus data\n"
1234	if ($verbosity);
1235
1236	# 1.
1237	# Read thesaurus file and store (symbols->thesaurusid) mapping
1238	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1239	my %symbolstothesid;
1240	my (@fields, $thesid, $symbols);
1241
1242	open(TH, "<$thesaurusfile");
1243
1244	while (<TH>) {
1245
1246	chomp;
1247	@fields = split(/:/, $_);
1248
1249	# get id and text
1250	$thesid = shift @fields;
1251	$symbols = shift @fields;
1252	$symbolstothesid{$symbols} = $thesid;
1253	}
1254	close TH;
1255
1256	# 2.
1257	# Read phrases file to find thesaurus entries that already
1258	# have a phindid. Store their phind-ids for later translation,
1259	# and store their frequency for later sorting.
1260	my %thesaurustophindid;
1261	my %phindidtofrequency;
1262	my ($phindid, $freq);
1263
1264	open(IN, "<$infile");
1265
1266	while(<IN>) {
1267
1268	chomp;
1269	@fields = split(/:/, $_);
1270
1271	# phindid and symbols for this line
1272	$phindid = shift @fields;
1273	$symbols = shift @fields;
1274	$freq = shift @fields;
1275
1276	# do we have a thesaurus id corresponding to this phrase?
1277	if (defined($symbolstothesid{$symbols})) {
1278	$thesid = $symbolstothesid{$symbols};
1279	$thesaurustophindid{$thesid} = $phindid;
1280	$phindidtofrequency{$phindid} = $freq;
1281	}
1282	}
1283	close IN;
1284
1285	undef %symbolstothesid;
1286
1287	# 3.
1288	# Create phind-id numbers for remaining thesaurus entries,
1289	# and note that their frequency is 0 for later sorting.
1290	my $nextphindid = $phindid + 1;
1291
1292	open(TH, "<$thesaurusfile");
1293	while(<TH>) {
1294
1295	chomp;
1296	@fields = split(/:/, $_);
1297
1298	# read thesaurus-id and ensure it has a corresponding phind-id
1299	$thesid = shift @fields;
1300	if (!defined($thesaurustophindid{$thesid})) {
1301	$thesaurustophindid{$thesid} = $nextphindid;
1302	$phindidtofrequency{$nextphindid} = 0;
1303	$nextphindid++;
1304	}
1305	}
1306	close TH;
1307
1308	# 4.
1309	# Translate thesaurus file, replacing thesaurus-id numbers with
1310	# phind-id numbers.
1311	my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1312	my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1313	my (@links, $linkid, %linkidtotype, $newrelation);
1314
1315	open(TH, "<$thesaurusfile");
1316	open(TO, ">$newthesaurusfile");
1317	while(<TH>) {
1318
1319	chomp;
1320	@fields = split(/:/, $_);
1321
1322	# phindid and symbols for this line
1323	($thesid, $symbols, $relations) = @fields;
1324
1325	die unless ($thesid && $symbols);
1326	die unless $thesaurustophindid{$thesid};
1327	$phindid = $thesaurustophindid{$thesid};
1328
1329	# convert each part of the relation string to use phind-id numbers
1330	# at the same time, we want to sort the list by frequency.
1331	undef %linkidtotype;
1332
1333	foreach $linktext (split(/;/, $relations)) {
1334	@linkdata = split(/,/, $linktext);
1335
1336	# remember the linktype (e.g. BT, NT)
1337	$linktype = shift @linkdata;
1338
1339	# store the type of each link
1340	foreach $thesid (@linkdata) {
1341	die unless (defined($thesaurustophindid{$thesid}));
1342	$linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1343	}
1344	}
1345
1346	# sort the list of links, first by frequency, then by type.
1347	@links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1348	or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1349	$linkcounter = (scalar @links);
1350
1351	# create a string describing the link information
1352	$linktype = $linkidtotype{$links[0]};
1353	$newrelation = $linktype;
1354	foreach $linkid (@links) {
1355	if ($linkidtotype{$linkid} ne $linktype) {
1356	$linktype = $linkidtotype{$linkid};
1357	$newrelation .= ";" . $linktype;
1358	}
1359	$newrelation .= "," . $linkid;
1360	}
1361	$newrelation .= ";";
1362
1363
1364	# output the new line
1365	print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1366	}
1367	close TH;
1368	close TO;
1369
1370	undef %thesaurustophindid;
1371	undef %linkidtotype;
1372	undef %phindidtofrequency;
1373
1374	# 5.
1375	# Read thesaurus data (in phind-id format) into memory
1376	my %thesaurusdata;
1377
1378	open(TH, "<$newthesaurusfile");
1379	while(<TH>) {
1380	chomp;
1381	($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1382	die unless ($phindid && $symbols);
1383	$thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1384	}
1385	close TH;
1386
1387	# 6.
1388	# Add thesaurus data to phrases file
1389	my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1390	my (@documwents, @newexp, $k, $n);
1391	my $linenumber = 0;
1392
1393	open(IN, "<$infile");
1394	open(OUT, ">$outfile");
1395
1396	# Update existing phrases
1397	while(<IN>) {
1398
1399	chomp;
1400	@fields = split(/:/, $_);
1401
1402	# get data for this line
1403	$phindid = shift @fields;
1404
1405	# output the phrase data, with thesaurus information
1406	print OUT "$phindid:", join(":", @fields);
1407
1408	# add thesaurus data
1409	if (defined($thesaurusdata{$phindid})) {
1410	@fields = split(/:/, $thesaurusdata{$phindid});
1411	shift @fields;
1412	$linkcounter = shift @fields;
1413	$relations = shift @fields;
1414
1415	print OUT ":$linkcounter:$relations";
1416	$thesaurusdata{$phindid} = "";
1417	}
1418	print OUT "\n";
1419	}
1420	close IN;
1421
1422	# Add phrases that aren't already in the file
1423	foreach $phindid (sort numerically keys %thesaurusdata) {
1424	next unless ($thesaurusdata{$phindid});
1425
1426	@fields = split(/:/, $thesaurusdata{$phindid});
1427	$symbols = shift @fields;
1428	$linkcounter = shift @fields;
1429	$relations = shift @fields;
1430
1431	print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1432	}
1433	close OUT;
1434
1435	}
1436
1437	# restore_vocabulary_data
1438	#
1439	# Read phrases.3 and restore vocabulary information. Then write
1440	# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1441	# (if requested) to the saved phrases file.
1442
1443	sub restore_vocabulary_data {
1444	my ($self) = @_;
1445
1446	my $out = $self->{'outhandle'};
1447	my $verbosity = $self->{'verbosity'};
1448	print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1449
1450	my $phinddir = $self->{'phinddir'};
1451	my $infile = &util::filename_cat($phinddir, 'phrases.3');
1452	my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1453	my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1454	my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1455
1456	my $savephrases = $self->{'savephrases'};
1457
1458	# 1.
1459	# Read the vocabulary file
1460	open(V, "<$vocabfile")
1461	\|\| die "Cannot open $vocabfile: $!";
1462	my @symbol;
1463	my $i = 1;
1464	while(<V>) {
1465	chomp;
1466	$symbol[$i++] = $_;
1467	}
1468	close V;
1469
1470	# 2.
1471	# Translate phrases.3 to MGPP input files
1472	my ($key, $text, $word, $isThesaurus);
1473	my @fields;
1474	my $linenumber = 0;
1475
1476	open(IN, "<$infile");
1477	open(DATA, ">$datafile");
1478	open(WORD, ">$wordfile");
1479
1480	# Save the phrases in a separate text file
1481	if ($savephrases) {
1482	print $out "Saving phrases in $savephrases\n" if ($verbosity);
1483	open(SAVE, ">$savephrases");
1484	}
1485
1486	while(<IN>) {
1487
1488	# read the line
1489	chomp;
1490	$line = $_;
1491	@fields = split(/:/, $line);
1492
1493	# get a phrase number for this line
1494	$key = shift @fields;
1495
1496	# restore the text of the phrase
1497	$text = shift @fields;
1498	$text =~ s/s(\d+)/$symbol[$1]/g;
1499	if ($text =~ / /) {
1500	$word = "";
1501	} elsif ($text ne 'untranslated') {
1502	$word = $text;
1503	}
1504
1505	# output the phrase data
1506	print DATA "<Document>";
1507	print DATA "$key:$text:", join(":", @fields), ":\n";
1508
1509	# output the word index search data
1510	print WORD "<Document>$word\n";
1511
1512	# output the phrases to a text file
1513	if ($savephrases) {
1514	if ((scalar @fields) == 7) {
1515	$isThesaurus = 1;
1516	} else {
1517	$isThesaurus = 0;
1518	}
1519	print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1520	}
1521	}
1522	close IN;
1523	close WORD;
1524	close DATA;
1525	close SAVE if ($savephrases);
1526
1527	}
1528
1529
1530
1531	# sort routines used to renumber phrases
1532
1533	sub numerically { $a <=> $b }
1534
1535	sub by_doc_frequency {
1536	my $fa = 1;
1537	if ($a =~ /,/) {
1538	$fa = $a;
1539	$fa =~ s/\d+,//;
1540	}
1541	my $fb = 1;
1542	if ($b =~ /,/) {
1543	$fb = $b;
1544	$fb =~ s/\d+,//;
1545	}
1546
1547	return ($fb <=> $fa);
1548	}
1549
1550	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: