source: main/trunk/greenstone2/perllib/classify/Phind.pm

Last change on this file was 28560, checked in by ak19, 10 years ago
  1. New subroutine util::set_gnomelib_env that sets the environment for gnomelib needed for running hashfile, suffix and wget which are dependent on the libiconv dll in ext/gnome-lib(-minimal). It's particularly the Mac Lions that need libiconv.2.dylib. 2. Updated the call to hashfile in doc.pm, the call to suffix in Phind.pm and the calls to wget in several perl scripts and modules to call util::set_gnomelib_env, though this will only set the environment once for each subshell.
  • Property svn:keywords set to Author Date Id Revision
File size: 41.6 KB
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32package Phind;
33
34use BaseClassifier;
35use FileUtils;
36use util;
37use ghtml;
38use unicode;
39
40use strict;
41no strict 'refs'; # allow filehandles to be variables and vice versa
42
43my @removedirs = ();
44
45my %wanted_index_files = ('td'=>1,
46 't'=>1,
47 'ti'=>1,
48 'tl'=>1,
49 'tsd'=>1,
50 'idb'=>1,
51 'ib1'=>1,
52 'ib2'=>1,
53 'ib3'=>1,
54 'i'=>1,
55 'il'=>1,
56 'w'=>1,
57 'wa'=>1);
58
59sub BEGIN {
60 @Phind::ISA = ('BaseClassifier');
61}
62
63sub END {
64
65 # Tidy up stray files - we do this here as there's some weird problem
66 # preventing us from doing it in the get_classify_info() function (on
67 # windows at least) where the close() appears to fail on txthandle and
68 # dochandle, thus preventing us from deleting those files
69
70 foreach my $dir (@removedirs) {
71 if (-d $dir && opendir (DIR, $dir)) {
72 my @files = readdir DIR;
73 closedir DIR;
74
75 foreach my $file (@files) {
76 next if $file =~ /^\.\.?$/;
77 my ($suffix) = $file =~ /\.([^\.]+)$/;
78 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
79 # delete it!
80 &FileUtils::removeFiles (&FileUtils::filenameConcatenate ($dir, $file));
81 }
82 }
83 }
84 }
85}
86
87my $arguments =
88 [ { 'name' => "text",
89 'desc' => "{Phind.text}",
90 'type' => "string",
91 'deft' => "section:Title,section:text",
92 'reqd' => "no" },
93 { 'name' => "title",
94 'desc' => "{Phind.title}",
95 'type' => "metadata",
96 'deft' => "Title",
97 'reqd' => "no" },
98 { 'name' => "buttonname",
99 'desc' => "{BasClas.buttonname}",
100 'type' => "string",
101 'deft' => "Phrase",
102 'reqd' => "no" },
103 { 'name' => "language",
104 'desc' => "{Phind.language}",
105 'type' => "string",
106 'deft' => "en",
107 'reqd' => "no" },
108 { 'name' => "savephrases",
109 'desc' => "{Phind.savephrases}",
110 'type' => "string",
111 'deft' => "",
112 'reqd' => "no" },
113 { 'name' => "suffixmode",
114 'desc' => "{Phind.suffixmode}",
115 'type' => "int",
116 'deft' => "1",
117 'range' => "0,1",
118 'reqd' => "no" },
119 { 'name' => "min_occurs",
120 'desc' => "{Phind.min_occurs}",
121 'type' => "int",
122 'deft' => "2",
123 'range' => "1,",
124 'reqd' => "no" },
125 { 'name' => "thesaurus",
126 'desc' => "{Phind.thesaurus}",
127 'type' => "string",
128 'deft' => "",
129 'reqd' => "no" },
130 { 'name' => "untidy",
131 'desc' => "{Phind.untidy}",
132 'type' => "flag",
133 'reqd' => "no" } ];
134
135my $options = { 'name' => "Phind",
136 'desc' => "{Phind.desc}",
137 'abstract' => "no",
138 'inherits' => "yes",
139 'args' => $arguments };
140
141
142# Phrase delimiter symbols - these should be abstracted out someplace
143
144my $colstart = "COLLECTIONSTART";
145my $colend = "COLLECTIONEND";
146my $doclimit = "DOCUMENTLIMIT";
147my $senlimit = "SENTENCELIMIT";
148my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
149
150
151# Create a new Phind browser based on collect.cfg
152
153sub new {
154 my ($class) = shift (@_);
155 my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
156 push(@$classifierslist, $class);
157
158 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
159 push(@{$hashArgOptLists->{"OptList"}},$options);
160
161 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
162
163 if ($self->{'info_only'}) {
164 # don't worry about any options etc
165 return bless $self, $class;
166 }
167
168 # Ensure the Phind generate scripts are in place
169
170 # the suffix binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, especially on Mac Lions (Android too?)
171 &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
172
173 my $file1 = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
174 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
175 my $src = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "src", "phind", "generate");
176 if (!(-e $file1)) {
177 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
178 exit(1);
179 }
180
181 # these values may have an 'ex.' prefix that needs to be stripped off
182 $self->{'text'} = $self->strip_ex_from_metadata($self->{'text'});
183 $self->{'title'} = $self->strip_ex_from_metadata($self->{'title'});
184
185 # Transfer the value from Auto Parsing to the variable name that was used in previous versions of Greenstone.
186
187 $self->{"indexes"} = $self->{"text"};
188
189 # Further setup
190 $self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
191 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
192 if (! defined $self->{'builddir'}) {
193 $self->{'builddir'} = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "building");
194 }
195 $self->{'total'} = 0;
196
197 # we set phind to be rtl if language is arabic
198 if ($self->{'language'} eq "ar") {
199 $self->{'textorientation'} = "rtl";
200 }
201 # Clean out the unused keys
202 delete $self->{"text"};
203
204 return bless $self, $class;
205}
206
207
208# Initialise the Phind classifier
209
210sub init {
211 my $self = shift (@_);
212
213 # ensure we have a build directory
214 my $builddir = $self->{'builddir'};
215 die unless (-e "$builddir");
216
217 # create Phind directory
218 my $phnumber = 1;
219 my $phinddir = &FileUtils::filenameConcatenate($builddir, "phind1");
220 while (-e "$phinddir") {
221 $phnumber++;
222 $phinddir = &FileUtils::filenameConcatenate($builddir, "phind$phnumber");
223 }
224 &FileUtils::makeDirectory("$phinddir");
225 $self->{'phinddir'} = $phinddir;
226 $self->{'phindnumber'} = $phnumber;
227
228 push(@removedirs, $phinddir) unless $self->{'untidy'};
229
230 # open filehandles for documents and text
231 my $clausefile = &FileUtils::filenameConcatenate("$phinddir", "clauses");
232 &FileUtils::removeFiles($clausefile) if (-e $clausefile);
233
234 my $txthandle = 'TEXT' . $phnumber;
235 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
236 $self->{'txthandle'} = $txthandle;
237
238 my $docfile = &FileUtils::filenameConcatenate("$phinddir", "docs.txt");
239 &FileUtils::removeFiles($docfile) if (-e $docfile);
240
241 my $dochandle = 'DOC' . $phnumber;
242 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
243 $self->{'dochandle'} = $dochandle;
244
245}
246
247
248# Classify each document.
249#
250# Each document is passed here in turn. The classifier extracts the
251# text of each and stores it in the clauses file. Document details are
252# stored in the docs.txt file.
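#
# As an illustrative sketch (the OID and title below are hypothetical), the
# two files this classifier writes to per document look like:
#
#   docs.txt   <Document>\tHASH0123456789abcdef\tAn Example Title
#   clauses    a DOCUMENTLIMIT line, followed by the tokenised text of the
#              document, one clause per line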
253
254sub classify {
255 my $self = shift (@_);
256 my ($doc_obj) = @_;
257
258 my $verbosity = $self->{'verbosity'};
259 my $top_section = $doc_obj->get_top_section();
260
261 my $titlefield = $self->{'title'};
262
263 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
264 if (!defined($title)) {
265 $title = "";
266 print STDERR "Phind: document has no title\n";
267 }
268 print "process: $title\n" if ($verbosity > 2);
269
270 # Only consider the file if it is in the correct language
271 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
272 my $phrlanguage = $self->{'language'};
273 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
274
275 # record this file
276 $self->{'total'} ++;
277 # what is $file ???
278 # print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
279
280 # Store document details
281 my $OID = $doc_obj->get_OID();
282 $OID = "NULL" unless defined $OID;
283 my $dochandle = $self->{'dochandle'};
284 print $dochandle "<Document>\t$OID\t$title\n";
285
286 # Store the text occurring in this object
287
288 # output the document delimiter
289 my $txthandle = $self->{'txthandle'};
290 print $txthandle "$doclimit\n";
291
292 # iterate over the required indexes and store their text
293 my $indexes = $self->{'indexes'};
294 my $text = "";
295 my ($part, $level, $field, $section, $data, $dataref);
296
297 foreach $part (split(/,/, $indexes)) {
298
299 # Each field has a level and a data element (e.g. document:Title)
300 ($level, $field) = split(/:/, $part);
301 die unless ($level && $field);
302
303 # Extract the text from every section
304 # (In phind, document:text and section:text are equivalent)
305 if ($field eq "text") {
306 $data = "";
307 $section = $doc_obj->get_top_section();
308 while (defined($section)) {
309 $data .= $doc_obj->get_text($section) . "\n";
310 $section = $doc_obj->get_next_section($section);
311 }
312 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
313 }
314
315 # Extract a metadata field from a document
316 # (If there is more than one element of the given type, get them all.)
317 elsif ($level eq "document") {
318 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
319 foreach $data (@$dataref) {
320 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
321 }
322 }
323
324 # Extract metadata from every section in a document
325 elsif ($level eq "section") {
326 $data = "";
327 $section = $doc_obj->get_top_section();
328 while (defined($section)) {
329 $dataref = $doc_obj->get_metadata($section, $field);
330 $data .= join("\n", @$dataref) . "\n";
331 $section = $doc_obj->get_next_section($section);
332 }
333 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
334 }
335
336 # Some sort of specification which I don't understand
337 else {
338 die "Unknown level ($level) in Phind index ($part)\n";
339 }
340
341 }
342
343 # output the text
344 $text =~ tr/\n//s;
345 print $txthandle "$text";
346}
347
348
349# Construct the classifier from the information already gathered
350#
351# When get_classify_info is called, the clauses and docs.txt files have
352# already been constructed in the Phind directory. This function will
353# translate them into compressed, indexed MGPP files that can be read by
354# the phindcgi script. It will also register our classifier so that it
355# shows up in the navigation bar.
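#
# In outline (a summary of the steps below, not new behaviour), the data flow is:
#   clauses -> extract_vocabulary() -> clauses.vocab, clauses.stats, clauses.numbers
#   the suffix program (reading the phind directory) -> phrases
#   phrases -> renumber_phrases() -> phrases.2, phrases.3, pdata.txt, pword.txt
#   pdata.txt, pword.txt, docs.txt -> mgpp_* tools -> compressed, indexed
#   phrase, word-level and document databases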
356
357sub get_classify_info {
358 my $self = shift (@_);
359 my ($gli) = @_;
360
361 close $self->{'dochandle'};
362 close $self->{'txthandle'};
363 my $verbosity = $self->{'verbosity'};
364 my $out = $self->{'outhandle'};
365 my $phinddir = $self->{'phinddir'};
366
367 my $osextra = "";
368 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
369 $osextra = " -d /";
370 }
371
372 print STDERR "</Stage>\n" if $gli;
373
374 if ($verbosity) {
375 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
376 print $out "*** in ", $self->{'phinddir'}, "\n";
377 }
378
379 print STDERR "<Stage name='Phind'>\n" if $gli;
380
381 # Construct phind indexes
382 my $suffixmode = $self->{'suffixmode'};
383 my $min_occurs = $self->{'min_occurs'};
384 my ($command, $status);
385
386 # Generate the vocabulary, symbol statistics, and numbers file
387 # from the clauses file
388 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
389 print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
390 &extract_vocabulary($self);
391
392 # Use the suffix program to generate the phind/phrases file
393 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
394 print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
395 &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
396
397 # check that we generated some files. It's not necessarily an error if
398 # we didn't (execute() would have quit on error), but we can't go on.
399 my $phrasesfile=&FileUtils::filenameConcatenate($self->{'phinddir'}, 'phrases');
400 if (! -r $phrasesfile) {
401 print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
402 print $out "\nNo phrases found for Phind classifier!\n";
403 return;
404 }
405
406 # Create the phrase file and put phrase numbers in phind/phrases
407 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
408 print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
409 &renumber_phrases($self);
410
411 print $out "\nCreating phrase databases\n";
412 print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
413 my $mg_input = &FileUtils::filenameConcatenate($phinddir, "pdata.txt");
414 my $mg_stem = &FileUtils::filenameConcatenate($phinddir, "pdata");
415
416 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
417 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
418 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
419
420 # create the mg index of words
421 print $out "\nCreating word-level search indexes\n";
422 print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
423 $mg_input = &FileUtils::filenameConcatenate($phinddir, "pword.txt");
424 $mg_stem = &FileUtils::filenameConcatenate($phinddir, "pword");
425
426 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
427 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
428 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
429 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
430 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
431 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
432
433 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
434 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
435 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
436
437 # create the mg document information database
438 print $out "\nCreating document information databases\n";
439 print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
440 $mg_input = &FileUtils::filenameConcatenate($phinddir, "docs.txt");
441 $mg_stem = &FileUtils::filenameConcatenate($phinddir, "docs");
442
443 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
444 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
445 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
446
447 my $parameters = "phindnumber=$self->{'phindnumber'}";
448 if (defined ($self->{'textorientation'})) {
449 $parameters .= ";textorientation=$self->{'textorientation'}";
450 }
451 # Return the information about the classifier that we'll later want to
452 # use to create macros when the Phind classifier document is displayed.
453 my %classifyinfo = ('thistype'=>'Invisible',
454 'childtype'=>'Phind',
455 'Title'=>$self->{'buttonname'},
456 'parameters'=>$parameters,
457 'contains'=>[]);
458
459 my $collection = $self->{'collection'};
460 my $url = "library?a=p&p=phind&c=$collection";
461 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
462
463 return \%classifyinfo;
464}
465
466
467
468sub convert_gml_to_tokens {
469
470 my ($language_exp, $text) = @_;
471
472 # escape any magic words... - jrm21
473 foreach my $delim (@delimiters) {
474 my $replacement=lc($delim);
475 my $num= $text=~ s/$delim/$replacement/g;
476 if (!$num) {$num=0;}
477 }
478
479 if ($language_exp =~ /^en$/) {
480 return &convert_gml_to_tokens_EN($text);
481 }
482
483 if ($language_exp =~ /zh/) {
484 return &convert_gml_to_tokens_ZH($text);
485 }
486
487 $_ = $text;
488
489 # 1. remove GML tags
490
491 # Remove everything that is in a tag
492 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
493 s/\s*<br>\s*/ LINEBREAK /isgo;
494 s/<[^>]*>/ /sgo;
495
496 # Now we have the text, but it may contain HTML
497 # elements coded as &gt; etc. Remove these tags.
498 s/&amp;/&/sgo;
499 s/&lt;/</sgo;
500 s/&gt;/>/sgo;
501 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
502 s/\s*<br>\s*/ LINEBREAK /isgo;
503 s/<[^>]*>/ /sgo;
504
505 # replace <p> and <br> placeholders with the clause break symbol (\n)
506 s/\s+/ /gso;
507 s/PARAGRAPHBREAK/\n/sgo;
508 s/LINEBREAK/\n/sgo;
509
510
511 # 2. Split the remaining text into space-delimited tokens
512
513 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
514 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
515
516 # Split text at word boundaries
517 s/\b/ /go;
518
519 # 3. Convert the remaining text to "clause format"
520
521 # Insert newline if the end of a sentence is detected
522 # (delimiter is: "[\.\?\!]\s")
523 # s/\s*[\.\?\!]\s+/\n/go;
524
525 # remove unnecessary punctuation and replace with clause break symbol (\n)
526 # the following very nicely removes all non-alphanumeric characters. too bad if you are not using English...
527 #s/[^\w ]/\n/go;
528 # replace punct with new lines - is this what we want??
529 s/\s*[\?\;\:\!\,\.\"\[\]\{\}\(\)]\s*/\n/go; #"
530 # then remove other punct with space
531 s/[\'\`\\\_]/ /go;
532
533 # remove extraneous whitespace
534 s/ +/ /sgo;
535 s/^\s+//mgo;
536 s/\s*$/\n/mgo;
537
538 # remove lines that contain one word or less
539 s/^\S*$//mgo;
540 s/^\s*$//mgo;
541 tr/\n//s;
542
543 return $_;
544}
545
546# a Chinese version
547sub convert_gml_to_tokens_ZH {
548
549 $_ = shift @_;
550
551 # Replace all whitespace with a simple space
552 s/\s+/ /gs;
553 # Remove everything that is in a tag
554 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
555 s/\s*<br>\s*/ LINEBREAK /isg;
556 s/<[^>]*>/ /sg;
557
558 # Now we have the text, but it may contain HTML
559 # elements coded as &gt; etc. Remove these tags.
560 s/&lt;/</sg;
561 s/&gt;/>/sg;
562
563 s/\s+/ /sg;
564 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
565 s/\s*<br>\s*/ LINEBREAK /isg;
566 s/<[^>]*>/ /sg;
567
568 # remove &amp; and other miscellaneous markup tags
569 s/&amp;/&/sg;
570 s/&lt;/</sg;
571 s/&gt;/>/sg;
572 s/&amp;/&/sg;
573
574 # replace <p> and <br> placeholders with newlines
575 s/PARAGRAPHBREAK/\n/sg;
576 s/LINEBREAK/\n/sg;
577
578
579# print STDERR "text:$_\n";
580 return $_;
581}
582
583# A version of convert_gml_to_tokens that is fine-tuned to the English language.
584
585sub convert_gml_to_tokens_EN {
586 $_ = shift @_;
587
588 # FIRST, remove GML tags
589
590 # Replace all whitespace with a simple space
591 s/\s+/ /gs;
592
593 # Remove everything that is in a tag
594 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
595 s/\s*<br>\s*/ LINEBREAK /isg;
596 s/<[^>]*>/ /sg;
597
598 # Now we have the text, but it may contain HTML
599 # elements coded as &gt; etc. Remove these tags.
600 s/&lt;/</sg;
601 s/&gt;/>/sg;
602
603 s/\s+/ /sg;
604 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
605 s/\s*<br>\s*/ LINEBREAK /isg;
606 s/<[^>]*>/ /sg;
607
608 # remove &amp; and other miscellaneous markup tags
609 s/&amp;/&/sg;
610 s/&lt;/</sg;
611 s/&gt;/>/sg;
612 s/&amp;/&/sg;
613
614 # replace <p> and <br> placeholders with newlines
615 s/PARAGRAPHBREAK/\n/sg;
616 s/LINEBREAK/\n/sg;
617
618
619 # Exceptional punctuation
620 #
621 # We make special cases of some punctuation
622
623 # remove any apostrophe that indicates omitted letters
624 s/(\w+)\'(\w*\s)/ $1$2 /g;
625
626 # remove the period that appears in a person's initials
627 s/\s([A-Z])\./ $1 /g;
628
629 # replace hyphens in hyphenated words and names with a space
630 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
631
632 # Convert the remaining text to "clause format".
633 # This means removing all excess punctuation and garbage text,
634 # normalising valid punctuation to full stops and commas,
635 # then putting one clause on each line.
636
637 # Insert newline when the end of a sentence is detected
638 # (delimiter is: "[\.\?\!]\s")
639 s/\s*[\.\?\!]\s+/\n/g;
640
641 # split numbers after four digits
642 s/(\d\d\d\d)/$1 /g;
643
644 # split words after 32 characters
645
646 # squash repeated punctuation
647 tr/A-Za-z0-9 //cs;
648
649 # save email addresses
650 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
651
652 # normalise clause breaks (mostly punctuation symbols) to commas
653 s/[^A-Za-z0-9 \n]+/ , /g;
654
655 # Remove repeated commas, and replace with newline
656 s/\s*,[, ]+/\n/g;
657
658 # remove extra whitespace
659 s/ +/ /sg;
660 s/^\s+//mg;
661 s/\s*$/\n/mg;
662
663 # remove lines that contain one word or less
664 s/^\w*$//mg;
665 s/^\s*$//mg;
666 tr/\n//s;
667
668 return $_;
669
670}
671
672
673
674# Execute a system command
675
676sub execute {
677 my ($command, $verbosity, $outhandle) = @_;
678 print $outhandle "Executing: $command\n" if ($verbosity > 2);
679 $! = 0;
680 my $status = system($command);
681 if ($status != 0) {
682 print STDERR "Phind - Error executing '$command': $!\n";
683 exit($status); # this causes the build to fail...
684 }
685}
686
687
688# Generate the vocabulary, symbol statistics, and numbers file from the
689# clauses file. This is legacy code, so it is a bit messy and probably won't
690# run under Windows.
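#
# The clauses.stats file written below records the symbol-number ranges for
# each class of token; as a purely illustrative example (the counts are
# hypothetical), a build with the four delimiters and 120 stopwords that
# actually occur in the text would start with:
#   first_delimiter 1
#   last_delimiter 4
#   first_stopword 5
#   last_stopword 124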
691
692sub extract_vocabulary {
693 my ($self) = @_;
694
695 my $verbosity = $self->{'verbosity'};
696 my $out = $self->{'outhandle'};
697
698 my $collectiondir = $self->{'collectiondir'};
699 my $phinddir = $self->{'phinddir'};
700
701 my $language_exp = $self->{'language'};
702
703 my ($w, $l, $line, $word);
704
705 my ($first_delimiter, $last_delimiter,
706 $first_stopword, $last_stopword,
707 $first_extractword, $last_extractword,
708 $first_contentword, $last_contentword,
709 $phrasedelimiter);
710
711 my $thesaurus = $self->{'thesaurus'};
712 my ($thesaurus_links, $thesaurus_terms,
713 %thesaurus, $first_thesaurusword, $last_thesaurusword);
714
715 my %symbol;
716 my (%freq);
717
718 print $out "Calculating vocabulary\n" if ($verbosity > 1);
719
720 # Read and store the stopwords
721 my $stopdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
722 my $stopword_files = ();
723 my ($language, $language_dir, $file, $file_name);
724 my %stopwords;
725
726 # Examine each directory in the stopword directory
727 opendir(STOPDIR, $stopdir);
728 foreach $language (readdir STOPDIR) {
729
730 # Ignore entries that do not match the classifier's language
731 next unless ($language =~ /$language_exp/);
732 $language_dir = &FileUtils::filenameConcatenate($stopdir, $language);
733 next unless (-d "$language_dir");
734
735 opendir(LANGDIR, $language_dir);
736 foreach $file (readdir LANGDIR) {
737
738 # Ignore entries that are not stopword files
739 next unless ($file =~ /sw$/);
740 $file_name = &FileUtils::filenameConcatenate($language_dir, $file);
741 next unless (-f "$file_name");
742
743 # Read the stopwords
744 open(STOPFILE, "<$file_name");
745 while (<STOPFILE>) {
746 s/^\s+//;
747 s/\s.*//;
748 $word = $_;
749 $l = lc($word);
750 $stopwords{$l} = $word;
751 }
752 close STOPFILE;
753
754 }
755 closedir LANGDIR;
756 }
757 closedir STOPDIR;
758
759 # Read thesaurus information
760 if ($thesaurus) {
761
762 # link file exists
763 $thesaurus_links = &FileUtils::filenameConcatenate($collectiondir, "etc", "$thesaurus.lnk");
764 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
765
766 # ensure term file exists in the correct language
767 if ($language_exp =~ /^([a-z][a-z])/) {
768 $language = $1;
769 } else {
770 $language = 'en';
771 }
772 $thesaurus_terms = &FileUtils::filenameConcatenate($collectiondir, "etc", "$thesaurus.$language");
773 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
774
775
776 # Read the thesaurus terms
777 open(TH, "<$thesaurus_terms");
778 while(<TH>) {
779 s/^\d+ //;
780 s/\(.*\)//;
781 foreach $w (split(/\s+/, $_)) {
782 $thesaurus{lc($w)} = $w;
783 }
784 }
785 close TH;
786 }
787
788 # Read the words in the text and count occurrences
789 open(TXT, "<$phinddir/clauses");
790
791 my @words;
792 while(<TXT>) {
793 $line = $_;
794 next unless ($line =~ /./);
795
796 @words = split(/\s+/, $line);
797 foreach $w (@words) {
798 $l = lc($w);
799 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
800 $freq{$w}++;
801 }
802 $freq{$senlimit}++;
803 }
804
805 close TXT;
806
807 # Calculate the "best" form of each word
808 my (%bestform, %totalfreq, %bestfreq);
809
810 foreach $w (sort (keys %freq)) {
811 $l = lc($w);
812
813 # totalfreq is the number of times a term appears in any form
814 $totalfreq{$l} += $freq{$w};
815
816 if (defined $stopwords{$l}) {
817 $bestform{$l} = $stopwords{$l};
818
819 } elsif (defined $thesaurus{$l}) {
820 $bestform{$l} = $thesaurus{$l};
821
822 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
823 $bestfreq{$l} = $freq{$w};
824 $bestform{$l} = $w;
825 }
826 }
827 undef %freq;
828 undef %bestfreq;
829
830
831 # Assign symbol numbers to tokens
832 my $nextsymbol = 1;
833 my (@vocab);
834
835 # Delimiters
836 $first_delimiter = 1;
837
838 foreach $word (@delimiters) {
839
840# $word = lc($word); # jrm21
841 $word = uc($word);
842 $bestform{$word} = $word;
843 $vocab[$nextsymbol] = $word;
844 $symbol{$word} = $nextsymbol;
845 $nextsymbol++;
846 }
847 $last_delimiter = $nextsymbol - 1;
848 # Stopwords
849 $first_stopword = $nextsymbol;
850
851 foreach my $word (sort keys %stopwords) {
852 # don't include stopword unless it occurs in the text
853 $word = lc($word);
854 next unless ($totalfreq{$word});
855 next if ($symbol{$word});
856
857 $vocab[$nextsymbol] = $word;
858 $symbol{$word} = $nextsymbol;
859 $nextsymbol++;
860 }
861 $last_stopword = $nextsymbol - 1;
862 $first_contentword = $nextsymbol;
863
864 # Thesaurus terms
865 if ($thesaurus) {
866 $first_thesaurusword = $nextsymbol;
867
868 foreach my $word (sort keys %thesaurus) {
869
870 $word = lc($word);
871 next if ($symbol{$word});
872 $bestform{$word} = $thesaurus{$word};
873
874 $vocab[$nextsymbol] = $word;
875 $symbol{$word} = $nextsymbol;
876 $nextsymbol++;
877
878 }
879 $last_thesaurusword = $nextsymbol - 1;
880 }
881
882 # Other content words
883 $first_extractword = $nextsymbol;
884
885 foreach my $word (sort (keys %bestform)) {
886
887 next if ($symbol{$word});
888
889 $vocab[$nextsymbol] = $word;
890 $symbol{$word} = $nextsymbol;
891 $nextsymbol++;
892 }
893 $last_extractword = $nextsymbol - 1;
894 $last_contentword = $nextsymbol - 1;
895
896 # Output the words
897 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
898 open(VOC, ">$phinddir/clauses.vocab");
899
900 for (my $i = 1; $i < $nextsymbol; $i++) {
901 $w = $vocab[$i];
902
903 print VOC "$bestform{$w}\n";
904 $totalfreq{$w} = 0 unless ($totalfreq{$w});
905 }
906 close VOC;
907
908
909 # Create statistics file
910 # Output statistics about the vocabulary
911 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
912 &FileUtils::removeFiles("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
913
914 open(STAT, ">$phinddir/clauses.stats")
915 || die "Cannot open $phinddir/clauses.stats: $!";
916
917 print STAT "first_delimiter $first_delimiter\n";
918 print STAT "last_delimiter $last_delimiter\n";
919 print STAT "first_stopword $first_stopword\n";
920 print STAT "last_stopword $last_stopword\n";
921 if ($thesaurus) {
922 print STAT "first_thesaurusword $first_thesaurusword\n";
923 print STAT "last_thesaurusword $last_thesaurusword\n";
924 }
925 print STAT "first_extractword $first_extractword\n";
926 print STAT "last_extractword $last_extractword\n";
927 print STAT "first_contentword $first_contentword\n";
928 print STAT "last_contentword $last_contentword\n";
929 print STAT "first_symbol $first_delimiter\n";
930 print STAT "last_symbol $last_contentword\n";
931 print STAT "first_word $first_stopword\n";
932 print STAT "last_word $last_contentword\n";
933 close STAT;
934
935 undef @vocab;
936
937
938 # Create numbers file
939 # Save text as symbol numbers
940 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
941
942 open(TXT, "<$phinddir/clauses");
943 open(NUM, ">$phinddir/clauses.numbers");
944
945## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
946## print NUM "$symbol{lc($colstart)}\n"; # jrm21
947 $phrasedelimiter = $symbol{$senlimit};
948 print NUM "$symbol{$colstart}\n";
949
950 # translate each clause into a list of symbol numbers
951 while(<TXT>) {
952
953 # split sentence into a list of tokens
954 $line = $_;
955 next unless ($line =~ /./);
956 @words = split(/\s+/, $line);
957
958 # output one token at a time
959 foreach $word (@words) {
960# don't lower-case special delimiters - jrm21
961 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
962 $word = lc($word);
963 }
964 print NUM "$symbol{$word}\n";
965 }
966
967 # output phrase delimiter
968 print NUM "$phrasedelimiter\n";
969 }
970
971 close TXT;
972# print NUM "$symbol{lc($colend)}\n";# jrm21
973 print NUM "$symbol{$colend}\n";
974 close NUM;
975
976 # Save thesaurus data in one convenient file
977 if ($thesaurus) {
978
979 my $thesaurusfile = &FileUtils::filenameConcatenate($phinddir, "$thesaurus.numbers");
980
981
982 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
983 if ($verbosity > 1);
984
985 # Read the thesaurus terms
986 my ($num, $text, %thes_symbols);
987
988 open(TH, "<$thesaurus_terms");
989 while(<TH>) {
990 chomp;
991 @words = split(/\s+/, $_);
992 $num = shift @words;
993 $text = "";
994
995 # translate words into symbol numbers
996 foreach $word (@words) {
997 $word = lc($word);
998 if ($symbol{$word}) {
999 $text .= "s$symbol{$word} ";
1000 } elsif ($verbosity) {
1001 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
1002 }
1003 }
1004 $text =~ s/ $//;
1005 $thes_symbols{$num} = $text;
1006 }
1007 close TH;
1008
1009 # Read the thesaurus links and write the corresponding data
1010 open(TH, "<$thesaurus_links");
1011 open(THOUT, ">$thesaurusfile");
1012
1013 while(<TH>) {
1014 chomp;
1015 ($num, $text) = split(/:/, $_);
1016
1017 if (defined($thes_symbols{$num})) {
1018 print THOUT "$num:$thes_symbols{$num}:$text\n";
1019 } else {
1020 print THOUT "$num:untranslated:$text\n";
1021 }
1022 }
1023 close TH;
1024 close THOUT;
1025 }
1026
1027
1028
1029
1030}
1031
1032
1033# renumber_phrases
1034#
1035# Prepare the phrases file to be input to mgpp. The biggest problem is
1036# reconciling the phrase identifiers used by the suffix program (which
1037# we'll call suffix-id numbers) with the numbers used in the thesaurus
1038# (thesaurus-id) to create a common set of phind-id numbers (phind-id).
1039# Phind-id numbers must be sorted by frequency of occurrence.
1040#
1041# Start creating a set of phind-id numbers from the sorted suffix-id
1042# numbers and (if required) the thesaurus-id numbers. Then add any other
1043# phrases occurring in the thesaurus.
1044#
1045# The last thing we have to do is restore the vocabulary information to the
1046# phrase file so that the phrases are stored as words, not as symbol
1047# numbers.
1048
1049# The original phrases file looks something like this:
1050# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1051# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1052# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1053# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
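#
# Reading the colon-separated fields of the first line above in the order they
# are parsed by renumber_suffix_data() below: suffix-id (159396-1), phrase
# symbols (s5175), total frequency (4), expansion count (1), expansions as
# comma-separated suffix-ids (116149-2), document count (3), and documents as
# semicolon-separated entries of the form dNNN, optionally followed by
# ",frequency" (d2240,2;d2253;d2254).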
1054
1055
1056sub renumber_phrases {
1057 my ($self) = @_;
1058
1059 renumber_suffix_data($self);
1060 renumber_thesaurus_data($self);
1061 restore_vocabulary_data($self);
1062
1063}
1064
1065
1066
1067# renumber_suffix_data
1068#
1069# Translate phrases file to phrases.2 using phind keys instead
1070# of suffix keys and sorting the expansion data.
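#
# Based on the print statements at the end of this subroutine, each phrases.2
# line has the form
#   phindid:symbols:tf:countexp:countdocs:exp1,exp2,...,:doc1;doc2;...;
# so the first example line above might become (the phind-ids shown are
# hypothetical):
#   1:s5175:4:1:3:7,:2240,2;2253;2254;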
1071
1072sub renumber_suffix_data {
1073 my ($self) = @_;
1074
1075 my $verbosity = $self->{'verbosity'};
1076 my $out = $self->{'outhandle'};
1077 print $out "Translate phrases: suffix-ids become phind-id's\n"
1078 if ($verbosity);
1079
1080 my $phinddir = $self->{'phinddir'};
1081 my $infile = &FileUtils::filenameConcatenate($phinddir, 'phrases');
1082 my $outfile = &FileUtils::filenameConcatenate($phinddir, 'phrases.2');
1083
1084 # Read the phrase file. Calculate initial set of phind-id
1085 # numbers and store (suffixid -> frequency) relation.
1086
1087 my %suffixtophind;
1088 my @totalfrequency;
1089 my (@fields, $suffixid);
1090 my $nextphind = 1;
1091
1092 open(IN, "<$infile");
1093 while(<IN>) {
1094
1095 chomp;
1096 @fields = split(/:/, $_);
1097
1098 # get next suffixid and phindid
1099 $suffixid = shift @fields;
1100 $suffixtophind{$suffixid} = $nextphind;
1101
1102 # store total frequency
1103 shift @fields;
1104 $totalfrequency[$nextphind] = shift @fields;
1105
1106 $nextphind++;
1107 }
1108 close IN;
1109
1110
1111 # Translate phrases file to phrases.2. Use phind keys (not suffix
1112# keys), sort expansion and document occurrence data in order of
1113# descending frequency.
1114 open(IN, "<$infile");
1115 open(OUT, ">$outfile");
1116
1117 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1118 my (@documwents, @newexp, $k, $n);
1119 my $linenumber = 0;
1120
1121 while(<IN>) {
1122
1123 # read the line
1124 chomp;
1125 @fields = split(/:/, $_);
1126
1127 # get a phrase number for this line
1128 $suffixid = shift @fields;
1129 die unless (defined($suffixtophind{$suffixid}));
1130 $phindid = $suffixtophind{$suffixid};
1131
1132 # get the symbols in the phrase
1133 $text = shift @fields;
1134
1135 # output status information
1136 $linenumber++;
1137 if ($verbosity > 2) {
1138 if ($linenumber % 1000 == 0) {
1139 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1140 }
1141 # what are $num and $key??
1142 #print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1143 }
1144
1145 # get the phrase frequency
1146 $tf = shift @fields;
1147
1148 # get the number of expansions
1149 $countexp = shift @fields;
1150
1151 # get the expansions, convert them into phind-id numbers, and sort them
1152 $expansions = shift @fields;
1153 @newexp = ();
1154 foreach $k (split(/,/, $expansions)) {
1155 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1156 $n = $suffixtophind{$k};
1157 push @newexp, $n;
1158 }
1159 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1160
1161 # get the number of documents
1162 $countdocs = shift @fields;
1163
1164 # get the documents and sort them
1165 $documents = shift @fields;
1166 $documents =~ s/d//g;
1167 my @documents = split(/;/, $documents);
1168 @documents = sort by_doc_frequency @documents;
1169
1170 # output the phrase data
1171 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1172 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1173
1174 }
1175
1176 close IN;
1177 close OUT;
1178}
1179
1180
1181# renumber_thesaurus_data
1182#
1183# Translate phrases.2 to phrases.3, adding thesaurus data if available.
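#
# When thesaurus data is present, each existing phrases.2 line gains two extra
# fields, ":linkcounter:relations", and thesaurus-only phrases are appended in
# the form
#   phindid:symbols:0:0:0:::linkcounter:relations
# (see the print statements at the end of this subroutine).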
1184
1185sub renumber_thesaurus_data {
1186 my ($self) = @_;
1187
1188 my $out = $self->{'outhandle'};
1189 my $verbosity = $self->{'verbosity'};
1190 my $thesaurus = $self->{'thesaurus'};
1191
1192 my $phinddir = $self->{'phinddir'};
1193 my $infile = &FileUtils::filenameConcatenate($phinddir, "phrases.2");
1194 my $outfile = &FileUtils::filenameConcatenate($phinddir, "phrases.3");
1195
1196
1197 # If no thesaurus is defined, simply move the phrases file.
1198 if (!$thesaurus) {
1199 print $out "Translate phrases.2: no thesaurus data\n"
1200 if ($verbosity);
1201 &FileUtils::moveFiles($infile, $outfile);
1202 return;
1203 }
1204
1205 print $out "Translate phrases.2: add thesaurus data\n"
1206 if ($verbosity);
1207
1208 # 1.
1209 # Read thesaurus file and store (symbols->thesaurusid) mapping
1210 my $thesaurusfile = &FileUtils::filenameConcatenate($phinddir, "$thesaurus.numbers");
1211 my %symbolstothesid;
1212 my (@fields, $thesid, $symbols);
1213
1214 open(TH, "<$thesaurusfile");
1215
1216 while (<TH>) {
1217
1218 chomp;
1219 @fields = split(/:/, $_);
1220
1221 # get id and text
1222 $thesid = shift @fields;
1223 $symbols = shift @fields;
1224 $symbolstothesid{$symbols} = $thesid;
1225 }
1226 close TH;
1227
1228 # 2.
1229 # Read phrases file to find thesaurus entries that already
1230 # have a phindid. Store their phind-ids for later translation,
1231 # and store their frequency for later sorting.
1232 my %thesaurustophindid;
1233 my %phindidtofrequency;
1234 my ($phindid, $freq);
1235
1236 open(IN, "<$infile");
1237
1238 while(<IN>) {
1239
1240 chomp;
1241 @fields = split(/:/, $_);
1242
1243 # phindid and symbols for this line
1244 $phindid = shift @fields;
1245 $symbols = shift @fields;
1246 $freq = shift @fields;
1247
1248 # do we have a thesaurus id corresponding to this phrase?
1249 if (defined($symbolstothesid{$symbols})) {
1250 $thesid = $symbolstothesid{$symbols};
1251 $thesaurustophindid{$thesid} = $phindid;
1252 $phindidtofrequency{$phindid} = $freq;
1253 }
1254 }
1255 close IN;
1256
1257 undef %symbolstothesid;
1258
1259 # 3.
1260 # Create phind-id numbers for remaining thesaurus entries,
1261 # and note that their frequency is 0 for later sorting.
1262 my $nextphindid = $phindid + 1;
1263
1264 open(TH, "<$thesaurusfile");
1265 while(<TH>) {
1266
1267 chomp;
1268 @fields = split(/:/, $_);
1269
1270 # read thesaurus-id and ensure it has a corresponding phind-id
1271 $thesid = shift @fields;
1272 if (!defined($thesaurustophindid{$thesid})) {
1273 $thesaurustophindid{$thesid} = $nextphindid;
1274 $phindidtofrequency{$nextphindid} = 0;
1275 $nextphindid++;
1276 }
1277 }
1278 close TH;
1279
1280 # 4.
1281 # Translate thesaurus file, replacing thesaurus-id numbers with
1282 # phind-id numbers.
1283 my $newthesaurusfile = &FileUtils::filenameConcatenate($phinddir, "$thesaurus.phindid");
1284 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1285 my (@links, $linkid, %linkidtotype, $newrelation);
1286
1287 open(TH, "<$thesaurusfile");
1288 open(TO, ">$newthesaurusfile");
1289 while(<TH>) {
1290
1291 chomp;
1292 @fields = split(/:/, $_);
1293
1294 # phindid and symbols for this line
1295 ($thesid, $symbols, $relations) = @fields;
1296
1297 die unless ($thesid && $symbols);
1298 die unless $thesaurustophindid{$thesid};
1299 $phindid = $thesaurustophindid{$thesid};
1300
1301 # convert each part of the relation string to use phind-id numbers
1302 # at the same time, we want to sort the list by frequency.
1303 undef %linkidtotype;
1304
1305 foreach $linktext (split(/;/, $relations)) {
1306 @linkdata = split(/,/, $linktext);
1307
1308 # remember the linktype (e.g. BT, NT)
1309 $linktype = shift @linkdata;
1310
1311 # store the type of each link
1312 foreach $thesid (@linkdata) {
1313 die unless (defined($thesaurustophindid{$thesid}));
1314 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1315 }
1316 }
1317
1318 # sort the list of links, first by frequency, then by type.
1319 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1320 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1321 $linkcounter = (scalar @links);
1322
1323 # create a string describing the link information
1324 $linktype = $linkidtotype{$links[0]};
1325 $newrelation = $linktype;
1326 foreach $linkid (@links) {
1327 if ($linkidtotype{$linkid} ne $linktype) {
1328 $linktype = $linkidtotype{$linkid};
1329 $newrelation .= ";" . $linktype;
1330 }
1331 $newrelation .= "," . $linkid;
1332 }
1333 $newrelation .= ";";
1334
1335
1336 # output the new line
1337 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1338 }
1339 close TH;
1340 close TO;
1341
1342 undef %thesaurustophindid;
1343 undef %linkidtotype;
1344 undef %phindidtofrequency;
1345
1346 # 5.
1347 # Read thesaurus data (in phind-id format) into memory
1348 my %thesaurusdata;
1349
1350 open(TH, "<$newthesaurusfile");
1351 while(<TH>) {
1352 chomp;
1353 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1354 die unless ($phindid && $symbols);
1355 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1356 }
1357 close TH;
1358
1359 # 6.
1360 # Add thesaurus data to phrases file
1361 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1362 my (@documwents, @newexp, $k, $n);
1363 my $linenumber = 0;
1364
1365 open(IN, "<$infile");
1366 open(OUT, ">$outfile");
1367
1368 # Update existing phrases
1369 while(<IN>) {
1370
1371 chomp;
1372 @fields = split(/:/, $_);
1373
1374 # get data for this line
1375 $phindid = shift @fields;
1376
1377 # output the phrase data, with thesaurus information
1378 print OUT "$phindid:", join(":", @fields);
1379
1380 # add thesaurus data
1381 if (defined($thesaurusdata{$phindid})) {
1382 @fields = split(/:/, $thesaurusdata{$phindid});
1383 shift @fields;
1384 $linkcounter = shift @fields;
1385 $relations = shift @fields;
1386
1387 print OUT ":$linkcounter:$relations";
1388 $thesaurusdata{$phindid} = "";
1389 }
1390 print OUT "\n";
1391 }
1392 close IN;
1393
1394 # Add phrases that aren't already in the file
1395 foreach $phindid (sort numerically keys %thesaurusdata) {
1396 next unless ($thesaurusdata{$phindid});
1397
1398 @fields = split(/:/, $thesaurusdata{$phindid});
1399 $symbols = shift @fields;
1400 $linkcounter = shift @fields;
1401 $relations = shift @fields;
1402
1403 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1404 }
1405 close OUT;
1406
1407}
1408
1409# restore_vocabulary_data
1410#
1411# Read phrases.3 and restore vocabulary information. Then write
1412# this data to the MGPP input files (pword.txt and pdata.txt) and
1413# (if requested) to the saved phrases file.
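#
# Broadly, for each phrase pdata.txt receives a "<Document>" marker followed by
# the phrase record with its sNNN symbol numbers replaced by vocabulary words,
# while pword.txt receives "<Document>" followed by the phrase text when the
# phrase is a single word, and an empty entry for multi-word phrases.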
1414
1415sub restore_vocabulary_data {
1416 my ($self) = @_;
1417
1418 my $out = $self->{'outhandle'};
1419 my $verbosity = $self->{'verbosity'};
1420 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1421
1422 my $phinddir = $self->{'phinddir'};
1423 my $infile = &FileUtils::filenameConcatenate($phinddir, 'phrases.3');
1424 my $vocabfile = &FileUtils::filenameConcatenate($phinddir, 'clauses.vocab');
1425 my $datafile = &FileUtils::filenameConcatenate($phinddir, 'pdata.txt');
1426 my $wordfile = &FileUtils::filenameConcatenate($phinddir, 'pword.txt');
1427
1428 my $savephrases = $self->{'savephrases'};
1429
1430 # 1.
1431 # Read the vocabulary file
1432 open(V, "<$vocabfile")
1433 || die "Cannot open $vocabfile: $!";
1434 my @symbol;
1435 my $i = 1;
1436 while(<V>) {
1437 chomp;
1438 $symbol[$i++] = $_;
1439 }
1440 close V;
1441
1442 # 2.
1443 # Translate phrases.3 to MGPP input files
1444 my ($key, $text, $word, $isThesaurus, $line);
1445 my @fields;
1446 my $linenumber = 0;
1447
1448 open(IN, "<$infile");
1449 open(DATA, ">$datafile");
1450 open(WORD, ">$wordfile");
1451
1452 # Save the phrases in a separate text file
1453 if ($savephrases) {
1454 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1455 open(SAVE, ">$savephrases");
1456 }
1457
1458 while(<IN>) {
1459
1460 # read the line
1461 chomp;
1462 $line = $_;
1463 @fields = split(/:/, $line);
1464
1465 # get a phrase number for this line
1466 $key = shift @fields;
1467
1468 # restore the text of the phrase
1469 $text = shift @fields;
1470 $text =~ s/s(\d+)/$symbol[$1]/g;
1471 if ($text =~ / /) {
1472 $word = "";
1473 } elsif ($text ne 'untranslated') {
1474 $word = $text;
1475 }
1476
1477 # output the phrase data
1478 print DATA "<Document>";
1479 print DATA "$key:$text:", join(":", @fields), ":\n";
1480
1481 # output the word index search data
1482 print WORD "<Document>$word\n";
1483
1484 # output the phrases to a text file
1485 if ($savephrases) {
1486 if ((scalar @fields) == 7) {
1487 $isThesaurus = 1;
1488 } else {
1489 $isThesaurus = 0;
1490 }
1491 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1492 }
1493 }
1494 close IN;
1495 close WORD;
1496 close DATA;
1497 close SAVE if ($savephrases);
1498
1499}
1500
1501
1502
1503# sort routines used to renumber phrases
1504
1505sub numerically { $a <=> $b }
1506
1507sub by_doc_frequency {
1508 my $fa = 1;
1509 if ($a =~ /,/) {
1510 $fa = $a;
1511 $fa =~ s/\d+,//;
1512 }
1513 my $fb = 1;
1514 if ($b =~ /,/) {
1515 $fb = $b;
1516 $fb =~ s/\d+,//;
1517 }
1518
1519 return ($fb <=> $fa);
1520}
1521
15221;