source: trunk/gsdl/perllib/classify/Phind.pm@ 10218

Last change on this file since 10218 was 10218, checked in by kjdon, 19 years ago

Jeffrey's new parsing modifications, committed approx 6 July, 15.16

  • Property svn:keywords set to Author Date Id Revision
File size: 40.0 KB
Line 
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32# 12/05/02 Added usage datastructure - John Thompson
33
34package Phind;
35
36use BasClas;
37use util;
38use ghtml;
39use unicode;
40
# Phind working directories scheduled for cleanup in the END block
# (populated by init() unless the user asked for an untidy build).
my @removedirs = ();

# File extensions of legitimate build products in a phind directory;
# anything else found there is deleted by the END block below.
my %wanted_index_files = ('td'=>1,
                          't'=>1,
                          'ti'=>1,
                          'tl'=>1,
                          'tsd'=>1,
                          'idb'=>1,
                          'ib1'=>1,
                          'ib2'=>1,
                          'ib3'=>1,
                          'i'=>1,
                          'il'=>1,
                          'w'=>1,
                          'wa'=>1);
56
sub BEGIN {
    # Phind inherits from the base classifier class.
    @ISA = ('BasClas');
}
60
# END block: remove stray working files from each registered phind
# build directory, keeping only the index files listed in
# %wanted_index_files.
#
# Tidying is done here (rather than in get_classify_info()) because of
# a problem - on Windows at least - where close() appears to fail on
# txthandle and dochandle, preventing those files from being deleted
# any earlier.
sub END {
    foreach my $dir (@removedirs) {
        # Skip anything that is not a readable directory.
        next unless (-d $dir);
        next unless opendir(my $dh, $dir);
        my @files = readdir $dh;
        closedir $dh;

        foreach my $file (@files) {
            # ignore "." and ".."
            next if $file =~ /^\.\.?$/;
            # Delete any file whose extension is not a wanted index file
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
                &util::rm(&util::filename_cat($dir, $file));
            }
        }
    }
}
84
# Argument specifications for the Phind classifier, consumed by the
# classifier framework to parse and document collect.cfg options.
# {Phind.*} descriptions are resolved from the resource bundles.
my $arguments =
    [ { 'name' => "text",
        'desc' => "{Phind.text}",
        'type' => "string",
        'deft' => "section:Title,section:text",
        'reqd' => "no" },
      { 'name' => "title",
        'desc' => "{Phind.title}",
        'type' => "metadata",
        'deft' => "Title",
        'reqd' => "no" },
      { 'name' => "buttonname",
        'desc' => "{BasClas.buttonname}",
        'type' => "string",
        'deft' => "Phrase",
        'reqd' => "no" },
      { 'name' => "language",
        'desc' => "{Phind.language}",
        'type' => "string",
        'deft' => "en",
        'reqd' => "no" },
      { 'name' => "savephrases",
        'desc' => "{Phind.savephrases}",
        'type' => "string",
        'deft' => "",
        'reqd' => "no" },
      { 'name' => "suffixmode",
        'desc' => "{Phind.suffixmode}",
        'type' => "int",
        'deft' => "1",
        'range' => "0,1",
        'reqd' => "no" },
      { 'name' => "min_occurs",
        'desc' => "{Phind.min_occurs}",
        'type' => "int",
        'deft' => "2",
        'range' => "1,",
        'reqd' => "no" },
      { 'name' => "thesaurus",
        'desc' => "{Phind.thesaurus}",
        'type' => "string",
        'deft' => "",
        'reqd' => "no" },
      { 'name' => "untidy",
        'desc' => "{Phind.untidy}",
        'type' => "flag",
        'reqd' => "no" } ];
132
# Classifier metadata handed to the framework: name, description,
# inheritance flags, and the argument list declared above.
my $options = { 'name' => "Phind",
                'desc' => "{Phind.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args' => $arguments };
138
139
# Phrase delimiter symbols - these should be abstracted out someplace.
# They mark collection, document and sentence boundaries in the
# "clauses" stream that is fed to the suffix program.

my $colstart = "COLLECTIONSTART";
my $colend = "COLLECTIONEND";
my $doclimit = "DOCUMENTLIMIT";
my $senlimit = "SENTENCELIMIT";
my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
147
148
149# Create a new Phind browser based on collect.cfg
150
# Create a new Phind browser based on collect.cfg.
#
# Arguments: class name, classifier list, input args and the framework's
# argument/option hash.  Registers this classifier's arguments/options,
# constructs the BasClas base object, and records collection settings.
# Exits with status 1 if the phind "suffix" program is not installed.
# Returns the blessed Phind object.
sub new {
    my ($class) = shift (@_);
    my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
    push(@$classifierslist, $class);

    # Register our arguments and options with the framework
    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    my $self = (defined $hashArgOptLists)
        ? BasClas->new($classifierslist, $inputargs, $hashArgOptLists)
        : BasClas->new($classifierslist, $inputargs);

    # Ensure the Phind "suffix" generation program is in place.
    # (Match "windows" case-insensitively, consistent with the GSDLOS
    # test in get_classify_info().)
    my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
    $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
    if (!(-e $file1)) {
        print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
        exit(1);
    }

    # Transfer the parsed "text" value to the variable name ("indexes")
    # that was used in previous versions of Greenstone.
    $self->{"indexes"} = $self->{"text"};

    # Further setup
    $self->{'collection'} = $ENV{'GSDLCOLLECTION'};     # classifier information
    $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};  # collection directories
    if (! defined $self->{'builddir'}) {
        $self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
    }
    $self->{'total'} = 0;

    # Clean out the unused keys
    delete $self->{"text"};

    return bless $self, $class;
}
186
187
188# Initialise the Phind classifier
189
# Initialise the Phind classifier.
#
# Creates a fresh phindN working directory under the build directory
# (phind1, phind2, ... - first unused number) and opens the "clauses"
# and "docs.txt" output files, storing their handles in $self.
# Dies if the build directory does not exist or a file cannot be opened.
sub init {
    my $self = shift (@_);

    # ensure we have a build directory
    my $builddir = $self->{'builddir'};
    die "Phind: build directory '$builddir' does not exist" unless (-e "$builddir");

    # create Phind directory, picking the first unused phindN name
    my $phnumber = 1;
    my $phinddir = &util::filename_cat($builddir, "phind1");
    while (-e "$phinddir") {
        $phnumber++;
        $phinddir = &util::filename_cat($builddir, "phind$phnumber");
    }
    &util::mk_dir("$phinddir");
    $self->{'phinddir'} = $phinddir;
    $self->{'phindnumber'} = $phnumber;

    # schedule the directory for cleanup unless the user wants it kept
    push(@removedirs, $phinddir) unless $self->{'untidy'};

    # open filehandles for documents and text.  Symbolic handle names
    # carry the phind number so multiple instances don't collide.
    my $clausefile = &util::filename_cat("$phinddir", "clauses");
    &util::rm($clausefile) if (-e $clausefile);

    my $txthandle = 'TEXT' . $phnumber;
    open($txthandle, '>', $clausefile) || die "Cannot open $clausefile: $!";
    $self->{'txthandle'} = $txthandle;

    my $docfile = &util::filename_cat("$phinddir", "docs.txt");
    &util::rm($docfile) if (-e $docfile);

    my $dochandle = 'DOC' . $phnumber;
    open($dochandle, '>', $docfile) || die "Cannot open $docfile: $!";
    $self->{'dochandle'} = $dochandle;
}
226
227
228# Classify each document.
229#
230# Each document is passed here in turn. The classifier extracts the
231# text of each and stores it in the clauses file. Document details are
232# stored in the docs.txt file.
233
# Classify each document.
#
# Each document is passed here in turn.  The classifier extracts the
# text of each and stores it in the clauses file.  Document details are
# stored in the docs.txt file.  Documents whose Language metadata does
# not match the classifier's language are skipped.
sub classify {
    my $self = shift (@_);
    my $doc_obj = shift @_;

    my $verbosity = $self->{'verbosity'};
    my $top_section = $doc_obj->get_top_section();

    my $titlefield = $self->{'title'};

    my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
    if (!defined($title)) {
        $title = "";
        print STDERR "Phind: document has no title\n";
    }
    print "process: $title\n" if ($verbosity > 2);

    # Only consider the file if it is in the correct language
    my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
    my $phrlanguage = $self->{'language'};
    return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));

    # record this file
    $self->{'total'} ++;
    # BUGFIX: this used to test $self->{'$verbosity'} - a literal
    # '$verbosity' hash key that never exists, so the message never
    # printed - and interpolated $file, which is not defined in this sub.
    print "file $self->{'total'}: $title\n" if ($verbosity);

    # Store document details
    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;
    my $dochandle = $self->{'dochandle'};
    print $dochandle "<Document>\t$OID\t$title\n";

    # Store the text occurring in this object

    # output the document delimiter
    my $txthandle = $self->{'txthandle'};
    print $txthandle "$doclimit\n";

    # iterate over the required indexes and store their text
    my $indexes = $self->{'indexes'};
    my $text = "";
    my ($part, $level, $field, $section, $data, $dataref);

    foreach $part (split(/,/, $indexes)) {

        # Each field has a level and a data element (e.g. document:Title)
        ($level, $field) = split(/:/, $part);
        die "Phind: invalid index specification '$part'" unless ($level && $field);

        # Extract the text from every section
        # (In phind, document:text and section:text are equivalent)
        if ($field eq "text") {
            $data = "";
            $section = $doc_obj->get_top_section();
            while (defined($section)) {
                $data .= $doc_obj->get_text($section) . "\n";
                $section = $doc_obj->get_next_section($section);
            }
            $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
        }

        # Extract a metadata field from a document
        # (If there is more than one element of the given type, get them all.)
        elsif ($level eq "document") {
            $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
            foreach $data (@$dataref) {
                $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
            }
        }

        # Extract metadata from every section in a document
        elsif ($level eq "section") {
            $data = "";
            $section = $doc_obj->get_top_section();
            while (defined($section)) {
                $dataref = $doc_obj->get_metadata($section, $field);
                $data .= join("\n", @$dataref) . "\n";
                $section = $doc_obj->get_next_section($section);
            }
            $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
        }

        # Any other level specification is an error
        else {
            die "Unknown level ($level) in Phind index ($part)\n";
        }

    }

    # output the text, squashing runs of newlines down to one
    $text =~ tr/\n//s;
    print $txthandle "$text";
}
326
327
328# Construct the classifier from the information already gathered
329#
330# When get_classify_info is called, the clauses and docs.txt files have
331# already been constructed in the Phind directory. This function will
332# translate them into compressed, indexed MGPP files that can be read by
333# the phindcgi script. It will also register our classifier so that it
334# shows up in the navigation bar.
335
# Construct the classifier from the information already gathered.
#
# When get_classify_info is called, the clauses and docs.txt files have
# already been constructed in the Phind directory.  This function
# translates them into compressed, indexed MGPP files that can be read
# by the phindcgi script, and returns the classification structure that
# registers the classifier in the navigation bar.
#
# Arguments: $gli - truthy when running under the GLI (emits progress
# markup on STDERR).  Returns a \%classifyinfo hashref, or nothing if
# no phrases were found.
sub get_classify_info {
    my $self = shift (@_);
    my ($gli) = @_;

    # Close the handles opened in init() so the files can be read back.
    close $self->{'dochandle'};
    close $self->{'txthandle'};
    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};
    my $phinddir = $self->{'phinddir'};

    # Extra flags for the mgpp tools on non-Windows systems.
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        $osextra = " -d /";
    }

    # Close the framework's previous progress stage before opening ours.
    print STDERR "</Stage>\n" if $gli;

    if ($verbosity) {
        print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
        print $out "*** in ", $self->{'phinddir'}, "\n";
    }

    print STDERR "<Stage name='Phind'>\n" if $gli;

    # Construct phind indexes
    my $suffixmode = $self->{'suffixmode'};
    my $min_occurs = $self->{'min_occurs'};
    my ($command, $status);  # NOTE(review): declared but never used below

    # Generate the vocabulary, symbol statistics, and numbers file
    # from the clauses file
    print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
    print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
    &extract_vocabulary($self);

    # Use the suffix program to generate the phind/phrases file
    print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
    print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
    &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);

    # check that we generated some files. It's not necessarily an error if
    # we didn't (execute() would have quit on error), but we can't go on.
    my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
    if (! -r $phrasesfile) {
        print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
        print $out "\nNo phrases found for Phind classifier!\n";
        return;
    }

    # Create the phrase file and put phrase numbers in phind/phrases
    print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
    print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
    &renumber_phrases($self);

    # Build the phrase-data database (pdata): two mgpp passes around
    # the compression-dictionary build.
    print $out "\nCreating phrase databases\n";
    print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
    my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
    my $mg_stem = &util::filename_cat($phinddir, "pdata");

    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);

    # create the mg index of words (pword), including inverted file,
    # weights and stemmed indexes for phrase searching
    print $out "\nCreating word-level search indexes\n";
    print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
    $mg_input = &util::filename_cat($phinddir, "pword.txt");
    $mg_stem = &util::filename_cat($phinddir, "pword");

    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);

    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);

    # create the mg document information database (docs)
    print $out "\nCreating document information databases\n";
    print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
    $mg_input = &util::filename_cat($phinddir, "docs.txt");
    $mg_stem = &util::filename_cat($phinddir, "docs");

    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);

    # Return the information about the classifier that we'll later want to
    # use to create macros when the Phind classifier document is displayed.
    my %classifyinfo = ('thistype'=>'Invisible',
                        'childtype'=>'Phind',
                        'Title'=>$self->{'buttonname'},
                        'parameters'=>"phindnumber=$self->{'phindnumber'}",
                        'contains'=>[]);

    my $collection = $self->{'collection'};
    my $url = "library?a=p&p=phind&c=$collection";
    push (@{$classifyinfo{'contains'}}, {'OID'=>$url});

    return \%classifyinfo;
}
440
441
442
# Convert a GML/HTML fragment into Phind "clause format".
#
# Arguments: a language expression (e.g. "en") and the raw text.
# Returns the text with markup stripped and one clause per line.
# English and Chinese are dispatched to fine-tuned versions; every
# other language falls through to the generic tokeniser below.
sub convert_gml_to_tokens {

    my ($language_exp, $text) = @_;

    # Escape any magic delimiter words appearing literally in the text
    # by lower-casing them, so they cannot be confused with the real
    # delimiters inserted later. - jrm21
    # (The old code captured the substitution count into an unused $num
    # variable; that dead bookkeeping has been removed.)
    foreach my $delim (@delimiters) {
        my $replacement = lc($delim);
        $text =~ s/$delim/$replacement/g;
    }

    if ($language_exp =~ /en/) {
        return &convert_gml_to_tokens_EN($text);
    }

    if ($language_exp =~ /zh/) {
        return &convert_gml_to_tokens_ZH($text);
    }

    $_ = $text;

    # 1. remove GML tags

    # Remove everything that is in a tag
    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
    s/\s*<br>\s*/ LINEBREAK /isgo;
    s/<[^>]*>/ /sgo;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc.  Remove these tags.
    s/&amp;/&/sgo;
    s/&lt;/</sgo;
    s/&gt;/>/sgo;
    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
    s/\s*<br>\s*/ LINEBREAK /isgo;
    s/<[^>]*>/ /sgo;

    # replace <p> and <br> placeholders with clause break symbol (\n)
    s/\s+/ /gso;
    s/PARAGRAPHBREAK/\n/sgo;
    s/LINEBREAK/\n/sgo;

    # 2. Split the remaining text into space-delimited tokens

    # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
    s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;

    # Split text at word boundaries
    s/\b/ /go;

    # 3. Convert the remaining text to "clause format"

    # Insert newline if the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    # s/\s*[\.\?\!]\s+/\n/go;

    # remove unnecessary punctuation and replace with clause break symbol (\n)
    s/[^\w ]/\n/go;

    # remove extraneous whitespace
    s/ +/ /sgo;
    s/^\s+//mgo;
    s/\s*$/\n/mgo;

    # remove lines that contain one word or less
    s/^\S*$//mgo;
    s/^\s*$//mgo;
    tr/\n//s;

    return $_;
}
517
# Chinese version of convert_gml_to_tokens: strips GML/HTML markup,
# decodes the basic entities, and turns <p>/<br> into clause breaks
# (newlines).  Takes the text as its only argument and returns the
# converted string.
sub convert_gml_to_tokens_ZH {

    $_ = shift @_;

    # Collapse every run of whitespace to a single space.
    s{\s+}{ }gs;

    # Replace paragraph/line breaks with placeholder words, then strip
    # every remaining tag.
    s{\s*<p>\s*}{ PARAGRAPHBREAK }isg;
    s{\s*<br>\s*}{ LINEBREAK }isg;
    s{<[^>]*>}{ }sg;

    # The text may still carry markup encoded as entities (&lt;p&gt;
    # etc.).  Decode the angle brackets and repeat the stripping pass.
    s{&lt;}{<}sg;
    s{&gt;}{>}sg;

    s{\s+}{ }sg;
    s{\s*<p>\s*}{ PARAGRAPHBREAK }isg;
    s{\s*<br>\s*}{ LINEBREAK }isg;
    s{<[^>]*>}{ }sg;

    # Decode the remaining common entities.  Note: &amp; is decoded
    # both before and after &lt;/&gt;, so doubly-encoded "&amp;amp;"
    # ends up as a bare "&".
    s{&amp;}{&}sg;
    s{&lt;}{<}sg;
    s{&gt;}{>}sg;
    s{&amp;}{&}sg;

    # Turn the placeholders into clause breaks.
    s{PARAGRAPHBREAK}{\n}sg;
    s{LINEBREAK}{\n}sg;

    return $_;
}
554
# A version of convert_gml_to_tokens that is fine-tuned to the English
# language.  Takes raw GML/HTML text as its only argument and returns
# "clause format": markup removed, punctuation normalised, one clause
# per line.

sub convert_gml_to_tokens_EN {
    $_ = shift @_;

    # FIRST, remove GML tags

    # Replace all whitespace with a simple space
    s/\s+/ /gs;

    # Remove everything that is in a tag
    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    s/\s*<br>\s*/ LINEBREAK /isg;
    s/<[^>]*>/ /sg;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc.  Remove these tags.
    s/&lt;/</sg;
    s/&gt;/>/sg;

    s/\s+/ /sg;
    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    s/\s*<br>\s*/ LINEBREAK /isg;
    s/<[^>]*>/ /sg;

    # remove &amp; and other miscellaneous markup tags
    # (&amp; is decoded twice, so doubly-encoded "&amp;amp;" decodes fully)
    s/&amp;/&/sg;
    s/&lt;/</sg;
    s/&gt;/>/sg;
    s/&amp;/&/sg;

    # replace <p> and <br> placeholders with carriage returns
    s/PARAGRAPHBREAK/\n/sg;
    s/LINEBREAK/\n/sg;

    # Exceptional punctuation
    #
    # We make special cases of some punctuation

    # remove any apostrophe that indicates omitted letters
    s/(\w+)\'(\w*\s)/ $1$2 /g;

    # remove period that appears in a person's initials
    s/\s([A-Z])\./ $1 /g;

    # replace hyphens in hyphenated words and names with a space
    s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;

    # Convert the remaining text to "clause format",
    # This means removing all excess punctuation and garbage text,
    # normalising valid punctuation to fullstops and commas,
    # then putting one clause on each line.

    # Insert newline when the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    s/\s*[\.\?\!]\s+/\n/g;

    # split numbers after four digits
    s/(\d\d\d\d)/$1 /g;

    # split words after 32 characters
    # (NOTE(review): stated intent, but no implementing code follows)

    # squash repeated punctuation
    tr/A-Za-z0-9 //cs;

    # save email addresses
    # s/\w+@\w+\.[\w\.]+/EMAIL/g;

    # normalise clause breaks (mostly punctuation symbols) to commas
    s/[^A-Za-z0-9 \n]+/ , /g;

    # Remove repeated commas, and replace with newline
    s/\s*,[, ]+/\n/g;

    # remove extra whitespace
    s/ +/ /sg;
    s/^\s+//mg;
    s/\s*$/\n/mg;

    # remove lines that contain one word or less
    s/^\w*$//mg;
    s/^\s*$//mg;
    tr/\n//s;

    return $_;

}
643
644
645
646# Execute a system command
647
# Execute a system command, echoing it first when verbosity > 2.
# If the command fails, report the failure on STDERR and abort the
# build with a non-zero exit status.
sub execute {
    my ($command, $verbosity, $outhandle) = @_;
    print $outhandle "Executing: $command\n"  if ($verbosity > 2);
    $! = 0;
    my $status = system($command);
    if ($status != 0) {
        print STDERR "Phind - Error executing '$command': $!\n";
        # BUGFIX: system() returns the raw wait status with the child's
        # exit code in the high byte.  Passing that raw status straight
        # to exit() can truncate to 0 (e.g. 256 & 0xFF == 0), making a
        # failed build look successful.  Exit with the child's real
        # code, falling back to 1 (or 127 when the command failed to
        # start at all).
        my $exit_code = ($status == -1) ? 127 : ($status >> 8);
        $exit_code = 1 if ($exit_code == 0);
        exit($exit_code); # this causes the build to fail...
    }
}
658
659
660# Generate the vocabulary, symbol statistics, and numbers file from the
661# clauses file. This is legacy code, so is a bit messy and probably wont
662# run under windows.
663
# Generate the vocabulary, symbol statistics, and numbers file from the
# clauses file.  This is legacy code, so is a bit messy and probably
# won't run under Windows.
#
# Reads $phinddir/clauses (written by classify()) plus the per-language
# stopword files and optional thesaurus files, then writes:
#   clauses.vocab   - one "best form" word per symbol number
#   clauses.stats   - first/last symbol-number ranges for each category
#   clauses.numbers - the clause text re-encoded as symbol numbers
#   <thesaurus>.numbers - thesaurus links re-encoded (when configured)
sub extract_vocabulary {
    my ($self) = @_;

    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};

    my $collectiondir = $self->{'collectiondir'};
    my $phinddir = $self->{'phinddir'};

    my $language_exp = $self->{'language'};

    my ($w, $l, $line, $word);

    # Symbol-range bookkeeping written to clauses.stats below.
    my ($first_delimiter, $last_delimiter,
        $first_stopword, $last_stopword,
        $first_extractword, $last_extractword,
        $first_contentword, $last_contentword,
        $phrasedelimiter);

    my $thesaurus = $self->{'thesaurus'};
    my ($thesaurus_links, $thesaurus_terms,
        %thesaurus, $first_thesaurusword, $last_thesaurusword);

    # %symbol maps token -> symbol number; %freq counts occurrences.
    my %symbol;
    my (%freq);

    print $out "Calculating vocabulary\n" if ($verbosity > 1);

    # Read and store the stopwords
    my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
    my $stopword_files = ();  # NOTE(review): assigns undef to a scalar and is never used
    my ($language, $language_dir, $file, $file_name);
    my %stopwords;

    # Examine each directory in the stopword directory
    opendir(STOPDIR, $stopdir);
    foreach $language (readdir STOPDIR) {

        # Ignore entries that do not match the classifier's language
        next unless ($language =~ /$language_exp/);
        $language_dir = &util::filename_cat($stopdir, $language);
        next unless (-d "$language_dir");

        opendir(LANGDIR, $language_dir);
        foreach $file (readdir LANGDIR) {

            # Ignore entries that are not stopword files (*.sw)
            next unless ($file =~ /sw$/);
            $file_name = &util::filename_cat($language_dir, $file);
            next unless (-f "$file_name");

            # Read the stopwords: first token of each line, keyed by
            # its lower-cased form so case variants collapse together.
            open(STOPFILE, "<$file_name");
            while (<STOPFILE>) {
                s/^\s+//;
                s/\s.*//;
                $word = $_;
                $l = lc($word);
                $stopwords{$l} = $word;
            }
            close STOPFILE;

        }
        closedir LANGDIR;
    }
    closedir STOPDIR;

    # Read thesaurus information
    if ($thesaurus) {

        # link file exists
        $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
        die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");

        # ensure term file exists in the correct language
        # (first two letters of the language expression, defaulting to en)
        if ($language_exp =~ /^([a-z][a-z])/) {
            $language = $1;
        } else {
            $language = 'en';
        }
        $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
        die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");


        # Read the thesaurus terms, stripping the leading id and any
        # parenthesised qualifier; key each word by its lower-cased form.
        open(TH, "<$thesaurus_terms");
        while(<TH>) {
            s/^\d+ //;
            s/\(.*\)//;
            foreach $w (split(/\s+/, $_)) {
                $thesaurus{lc($w)} = $w;
            }
        }
        close TH;
    }

    # Read words in the text and count occurrences
    open(TXT, "<$phinddir/clauses");

    my @words;
    while(<TXT>) {
        $line = $_;
        next unless ($line =~ /./);

        @words = split(/\s+/, $line);
        foreach $w (@words) {
            $l = lc($w);
            # count stopwords/thesaurus terms under their lower-cased form
            $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
            $freq{$w}++;
        }
        # every clause line implies a sentence delimiter
        $freq{$senlimit}++;
    }

    close TXT;

    # Calculate the "best" form of each word: stopword/thesaurus
    # spelling wins, otherwise the most frequent case variant.
    my (%bestform, %totalfreq, %bestfreq);

    foreach $w (sort (keys %freq)) {
        $l = lc($w);

        # totalfreq is the number of times a term appears in any form
        $totalfreq{$l} += $freq{$w};

        if (defined $stopwords{$l}) {
            $bestform{$l} = $stopwords{$l};

        } elsif (defined $thesaurus{$l}) {
            $bestform{$l} = $thesaurus{$l};

        } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
            $bestfreq{$l} = $freq{$w};
            $bestform{$l} = $w;
        }
    }
    undef %freq;
    undef %bestfreq;


    # Assign symbol numbers to tokens, in category order:
    # delimiters, stopwords, thesaurus terms, then other content words.
    my $nextsymbol = 1;
    my (@vocab);

    # Delimiters
    $first_delimiter = 1;

    foreach $word (@delimiters) {

#       $word = lc($word); # jrm21
        $word = uc($word);
        $bestform{$word} = $word;
        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_delimiter = $nextsymbol - 1;
    # Stopwords
    $first_stopword = $nextsymbol;

    foreach my $word (sort keys %stopwords) {
        # don't include stopword unless it occurs in the text
        $word = lc($word);
        next unless ($totalfreq{$word});
        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_stopword = $nextsymbol - 1;
    $first_contentword = $nextsymbol;

    # Thesaurus terms
    if ($thesaurus) {
        $first_thesaurusword = $nextsymbol;

        foreach my $word (sort keys %thesaurus) {

            $word = lc($word);
            next if ($symbol{$word});
            $bestform{$word} = $thesaurus{$word};

            $vocab[$nextsymbol] = $word;
            $symbol{$word} = $nextsymbol;
            $nextsymbol++;

        }
        $last_thesaurusword = $nextsymbol - 1;
    }

    # Other content words
    $first_extractword = $nextsymbol;

    foreach my $word (sort (keys %bestform)) {

        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_extractword = $nextsymbol - 1;
    $last_contentword = $nextsymbol - 1;

    # Output the words, one best-form per line, in symbol-number order
    print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
    open(VOC, ">$phinddir/clauses.vocab");

    for (my $i = 1; $i < $nextsymbol; $i++) {
        $w = $vocab[$i];

        print VOC "$bestform{$w}\n";
        $totalfreq{$w} = 0 unless ($totalfreq{$w});
    }
    close VOC;


    # Create statistics file
    # Output statistics about the vocabulary
    print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
    &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");

    open(STAT, ">$phinddir/clauses.stats")
        || die "Cannot open $phinddir/clauses.stats: $!";

    print STAT "first_delimiter $first_delimiter\n";
    print STAT "last_delimiter $last_delimiter\n";
    print STAT "first_stopword $first_stopword\n";
    print STAT "last_stopword $last_stopword\n";
    if ($thesaurus) {
        print STAT "first_thesaurusword $first_thesaurusword\n";
        print STAT "last_thesaurusword $last_thesaurusword\n";
    }
    print STAT "first_extractword $first_extractword\n";
    print STAT "last_extractword $last_extractword\n";
    print STAT "first_contentword $first_contentword\n";
    print STAT "last_contentword $last_contentword\n";
    print STAT "first_symbol $first_delimiter\n";
    print STAT "last_symbol $last_contentword\n";
    print STAT "first_word $first_stopword\n";
    print STAT "last_word $last_contentword\n";
    close STAT;

    undef @vocab;


    # Create numbers file
    # Save text as symbol numbers
    print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);

    open(TXT, "<$phinddir/clauses");
    open(NUM, ">$phinddir/clauses.numbers");

##    $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
##    print NUM "$symbol{lc($colstart)}\n"; # jrm21
    $phrasedelimiter = $symbol{$senlimit};
    print NUM "$symbol{$colstart}\n";

    # set up the special symbols that delimit documents and sentences
    while(<TXT>) {

        # split sentence into a list of tokens
        $line = $_;
        next unless ($line =~ /./);
        @words = split(/\s+/, $line);

        # output one token at a time
        foreach $word (@words) {
# don't lower-case special delimiters - jrm21
            if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
                $word = lc($word);
            }
            print NUM "$symbol{$word}\n";
        }

        # output phrase delimiter
        print NUM "$phrasedelimiter\n";
    }

    close TXT;
#    print NUM "$symbol{lc($colend)}\n"; # jrm21
    print NUM "$symbol{$colend}\n";
    close NUM;

    # Save thesaurus data in one convenient file
    if ($thesaurus) {

        my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");


        print $out "Saving thesaurus as numbers in $thesaurusfile\n"
            if ($verbosity > 1);

        # Read the thesaurus terms, mapping each term id to its words
        # encoded as "sNNN" symbol references.
        my ($num, $text, %thes_symbols);

        open(TH, "<$thesaurus_terms");
        while(<TH>) {
            chomp;
            @words = split(/\s+/, $_);
            $num = shift @words;
            $text = "";

            # translate words into symbol numbers
            foreach $word (@words) {
                $word = lc($word);
                if ($symbol{$word}) {
                    $text .= "s$symbol{$word} ";
                } elsif ($verbosity) {
                    print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
                }
            }
            $text =~ s/ $//;
            $thes_symbols{$num} = $text;
        }
        close TH;

        # Read the thesaurus links and write the corresponding data
        open(TH, "<$thesaurus_links");
        open(THOUT, ">$thesaurusfile");

        while(<TH>) {
            chomp;
            ($num, $text) = split(/:/, $_);

            if (defined($thes_symbols{$num})) {
                print THOUT "$num:$thes_symbols{$num}:$text\n";
            } else {
                print THOUT "$num:untranslated:$text\n";
            }
        }
        close TH;
        close THOUT;
    }




}
1003
1004
1005# renumber_phrases
1006#
# Prepare the phrases file to be input to mgpp. The biggest problem is
# reconciling the phrase identifiers used by the suffix program (which
# we'll call suffix-id numbers) with the numbers used in the thesaurus
# (thesaurus-id) to create a common set of phind id numbers (phind-id).
# Phind-id numbers must be sorted by frequency of occurrence.
#
# Start creating a set of phind-id numbers from the sorted suffix-id
# numbers and (if required) the thesaurus-id numbers. Then add any other
# phrases occurring in the thesaurus.
#
# The last thing we have to do is restore the vocabulary information to the
# phrase file so that the phrases are stored as words, not as symbol
# numbers.
1020
1021# The original phrases file looks something like this:
1022# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1023# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1024# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1025# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1026
1027
# renumber_phrases: prepare the phrases file for mgpp input by running
# the three renumbering passes in order - translate suffix ids to
# phind ids, merge in thesaurus ids, then restore the vocabulary text.
sub renumber_phrases {
    my ($self) = @_;

    foreach my $pass (\&renumber_suffix_data,
                      \&renumber_thesaurus_data,
                      \&restore_vocabulary_data) {
        $pass->($self);
    }
}
1036
1037
1038
1039# renumber_suffix_data
1040#
1041# Translate phrases file to phrases.2 using phind keys instead
1042# of suffix keys and sorting the expansion data.
1043
sub renumber_suffix_data {
    my ($self) = @_;

    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};
    print $out "Translate phrases: suffix-ids become phind-id's\n"
        if ($verbosity);

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, 'phrases');
    my $outfile = &util::filename_cat($phinddir, 'phrases.2');

    # Pass 1: read the phrase file, assign each suffix-id a phind-id in
    # order of appearance, and record each phrase's total frequency so
    # expansions can later be sorted by descending frequency.
    my %suffixtophind;
    # Was declared as the unused 'my @phindfrequency' while the code used
    # an undeclared package array @totalfrequency; declare the real one.
    my @totalfrequency;
    my (@fields, $suffixid);
    my $nextphind = 1;

    open(IN, "<", $infile) or die "Cannot open $infile: $!";
    while (<IN>) {

        chomp;
        @fields = split(/:/, $_);

        # get next suffixid and phindid
        $suffixid = shift @fields;
        $suffixtophind{$suffixid} = $nextphind;

        # skip the phrase text, then store the total frequency
        shift @fields;
        $totalfrequency[$nextphind] = shift @fields;

        $nextphind++;
    }
    close IN;


    # Pass 2: translate phrases to phrases.2.  Use phind keys (not suffix
    # keys), and sort expansion and document occurrence data in order of
    # descending frequency.
    open(IN, "<", $infile) or die "Cannot open $infile: $!";
    open(OUT, ">", $outfile) or die "Cannot open $outfile: $!";

    my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
    # (fixed: previous code declared a typo'd '@documwents' and used an
    # undeclared global @documents)
    my (@documents, @newexp, $k, $n);
    my $linenumber = 0;

    while (<IN>) {

        # read the line
        chomp;
        @fields = split(/:/, $_);

        # get a phrase number for this line
        $suffixid = shift @fields;
        die "ERROR - no phindid for suffix-id: $suffixid"
            unless (defined($suffixtophind{$suffixid}));
        $phindid = $suffixtophind{$suffixid};

        # get the symbols in the phrase
        $text = shift @fields;

        # output status information
        $linenumber++;
        if ($verbosity > 2) {
            if ($linenumber % 1000 == 0) {
                print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
            }
            # (fixed: this trace formerly printed the undefined variables
            # $num and $key)
            print $out "$phindid: $suffixid\t($text)\n" if ($verbosity > 3);
        }

        # get the phrase frequency
        $tf = shift @fields;

        # get the number of expansions
        $countexp = shift @fields;

        # get the expansions, convert them into phind-id numbers, and sort
        # them by descending total frequency
        $expansions = shift @fields;
        @newexp = ();
        foreach $k (split(/,/, $expansions)) {
            die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
            $n = $suffixtophind{$k};
            push @newexp, $n;
        }
        @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;

        # get the number of documents
        $countdocs = shift @fields;

        # get the documents and sort them by descending frequency
        $documents = shift @fields;
        $documents =~ s/d//g;
        @documents = split(/;/, $documents);
        @documents = sort by_doc_frequency @documents;

        # output the phrase data
        print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
        print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";

    }

    close IN;
    close OUT;
}
1150
1151
1152# renumber_thesaurus_data
1153#
1154# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1155
# Translate phrases.2 to phrases.3.  If a thesaurus is configured, merge
# its entries into the phrase data: reuse the phind-ids of phrases that
# already occur in phrases.2, mint new phind-ids for the rest, and append
# each entry's relation links (sorted by frequency, then link type) to
# the corresponding output line.  Without a thesaurus this is a plain
# file rename.
sub renumber_thesaurus_data {
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    my $thesaurus = $self->{'thesaurus'};

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, "phrases.2");
    my $outfile = &util::filename_cat($phinddir, "phrases.3");


    # If no thesaurus is defined, simply move the phrases file.
    if (!$thesaurus) {
	print $out "Translate phrases.2: no thesaurus data\n"
	    if ($verbosity);
	&util::mv($infile, $outfile);
	return;
    }

    print $out "Translate phrases.2: add thesaurus data\n"
	if ($verbosity);

    # 1.
    # Read thesaurus file and store (symbols->thesaurusid) mapping.
    # Each line of "$thesaurus.numbers" is "thesid:symbols:relations".
    my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
    my %symbolstothesid;
    my (@fields, $thesid, $symbols);

    # NOTE(review): open is unchecked here (and below); a missing file
    # silently yields an empty mapping.
    open(TH, "<$thesaurusfile");

    while (<TH>) {

	chomp;
	@fields = split(/:/, $_);

	# get id and text
	$thesid = shift @fields;
	$symbols = shift @fields;
	$symbolstothesid{$symbols} = $thesid;
    }
    close TH;

    # 2.
    # Read phrases file to find thesaurus entries that already
    # have a phindid. Store their phind-ids for later translation,
    # and store their frequency for later sorting.
    my %thesaurustophindid;
    my %phindidtofrequency;
    my ($phindid, $freq);

    open(IN, "<$infile");

    while(<IN>) {

	chomp;
	@fields = split(/:/, $_);

	# phindid and symbols for this line
	$phindid = shift @fields;
	$symbols = shift @fields;
	$freq = shift @fields;

	# do we have a thesaurus id corresponding to this phrase?
	if (defined($symbolstothesid{$symbols})) {
	    $thesid = $symbolstothesid{$symbols};
	    $thesaurustophindid{$thesid} = $phindid;
	    $phindidtofrequency{$phindid} = $freq;
	}
    }
    close IN;

    # free the symbol mapping; only id mappings are needed from here on
    undef %symbolstothesid;

    # 3.
    # Create phind-id numbers for remaining thesaurus entries,
    # and note that their frequency is 0 for later sorting.
    # NOTE(review): $phindid still holds the id read from the *last* line
    # of phrases.2 — this assumes phind-ids appear in ascending order so
    # that the last one is the maximum; verify against the file producer.
    my $nextphindid = $phindid + 1;

    open(TH, "<$thesaurusfile");
    while(<TH>) {

	chomp;
	@fields = split(/:/, $_);

	# read thesaurus-id and ensure it has a corresponding phind-id
	$thesid = shift @fields;
	if (!defined($thesaurustophindid{$thesid})) {
	    $thesaurustophindid{$thesid} = $nextphindid;
	    $phindidtofrequency{$nextphindid} = 0;
	    $nextphindid++;
	}
    }
    close TH;

    # 4.
    # Translate thesaurus file, replacing thesaurus-id numbers with
    # phind-id numbers.  Output goes to "$thesaurus.phindid" in the form
    # "phindid:symbols:linkcount:relations:".
    my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
    my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
    my (@links, $linkid, %linkidtotype, $newrelation);

    open(TH, "<$thesaurusfile");
    open(TO, ">$newthesaurusfile");
    while(<TH>) {

	chomp;
	@fields = split(/:/, $_);

	# phindid and symbols for this line
	($thesid, $symbols, $relations) = @fields;

	die unless ($thesid && $symbols);
	die unless $thesaurustophindid{$thesid};
	$phindid = $thesaurustophindid{$thesid};

	# convert each part of the relation string to use phind-id numbers
	# at the same time, we want to sort the list by frequency.
	undef %linkidtotype;

	# relations look like "TYPE,id,id;TYPE,id;..." (e.g. BT/NT groups)
	foreach $linktext (split(/;/, $relations)) {
	    @linkdata = split(/,/, $linktext);

	    # remember the linktype (e.g. BT, NT)
	    $linktype = shift @linkdata;

	    # store the type of each link
	    foreach $thesid (@linkdata) {
		die unless (defined($thesaurustophindid{$thesid}));
		$linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
	    }
	}

	# sort the list of links, first by frequency, then by type.
	@links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
			or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
	$linkcounter = (scalar @links);

	# create a string describing the link information
	# NOTE(review): if a line has no relations, @links is empty and
	# $links[0] is undef, leaving $newrelation as "undef;" — confirm
	# the thesaurus format guarantees at least one relation per line.
	$linktype = $linkidtotype{$links[0]};
	$newrelation = $linktype;
	foreach $linkid (@links) {
	    # emit the type tag once per run of same-typed links
	    if ($linkidtotype{$linkid} ne $linktype) {
		$linktype = $linkidtotype{$linkid};
		$newrelation .= ";" . $linktype;
	    }
	    $newrelation .= "," . $linkid;
	}
	$newrelation .= ";";


	# output the new line
	print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
    }
    close TH;
    close TO;

    # release mappings no longer needed
    undef %thesaurustophindid;
    undef %linkidtotype;
    undef %phindidtofrequency;

    # 5.
    # Read thesaurus data (in phind-id format) into memory
    my %thesaurusdata;

    open(TH, "<$newthesaurusfile");
    while(<TH>) {
	chomp;
	($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
	die unless ($phindid && $symbols);
	$thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
    }
    close TH;

    # 6.
    # Add thesaurus data to phrases file
    my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
    # NOTE(review): '@documwents' looks like a typo of '@documents';
    # harmless here as it is declared but never used.
    my (@documwents, @newexp, $k, $n);
    my $linenumber = 0;

    open(IN, "<$infile");
    open(OUT, ">$outfile");

    # Update existing phrases
    while(<IN>) {

	chomp;
	@fields = split(/:/, $_);

	# get data for this line
	$phindid = shift @fields;

	# output the phrase data, with thesaurus information
	print OUT "$phindid:", join(":", @fields);

	# add thesaurus data; entries already emitted are blanked so they
	# are not duplicated in the loop below
	if (defined($thesaurusdata{$phindid})) {
	    @fields = split(/:/, $thesaurusdata{$phindid});
	    shift @fields;
	    $linkcounter = shift @fields;
	    $relations = shift @fields;

	    print OUT ":$linkcounter:$relations";
	    $thesaurusdata{$phindid} = "";
	}
	print OUT "\n";
    }
    close IN;

    # Add phrases that aren't already in the file
    # (frequency/expansion/document fields are all zero/empty for these)
    foreach $phindid (sort numerically keys %thesaurusdata) {
	next unless ($thesaurusdata{$phindid});

	@fields = split(/:/, $thesaurusdata{$phindid});
	$symbols = shift @fields;
	$linkcounter = shift @fields;
	$relations = shift @fields;

	print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
    }
    close OUT;

}
1379
1380# restore_vocabulary_data
1381#
1382# Read phrases.3 and restore vocabulary information. Then write
1383# this data to the MGPP input files (pword.txt and pdata.txt) and
1384# (if requested) to the saved phrases file.
1385
# Read phrases.3, replace each "s<number>" symbol with the word stored at
# that index in clauses.vocab, and write the MGPP input files pdata.txt
# (one "<Document>" record per phrase) and pword.txt (one searchable word
# per phrase).  If 'savephrases' is set, also dump a tab-separated
# summary of the phrases to that file.
sub restore_vocabulary_data {
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, 'phrases.3');
    my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
    my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
    my $wordfile = &util::filename_cat($phinddir, 'pword.txt');

    my $savephrases = $self->{'savephrases'};

    # 1.
    # Read the vocabulary file: one word per line, stored 1-indexed so
    # symbol numbers map directly onto @symbol subscripts.
    open(V, "<$vocabfile")
	|| die "Cannot open $vocabfile: $!";
    my @symbol;
    my $i = 1;
    while(<V>) {
	chomp;
	$symbol[$i++] = $_;
    }
    close V;

    # 2.
    # Translate phrases.3 to MGPP input files
    my ($key, $text, $word, $isThesaurus);
    my @fields;
    my $linenumber = 0;

    # NOTE(review): these opens are unchecked, unlike the vocab open above.
    open(IN, "<$infile");
    open(DATA, ">$datafile");
    open(WORD, ">$wordfile");

    # Save the phrases in a separate text file
    if ($savephrases) {
	print $out "Saving phrases in $savephrases\n" if ($verbosity);
	open(SAVE, ">$savephrases");
    }

    while(<IN>) {

	# read the line
	chomp;
	# NOTE(review): $line is not declared with 'my' (this file does not
	# use strict), so it is a package global.
	$line = $_;
	@fields = split(/:/, $line);

	# get a phrase number for this line
	$key = shift @fields;

	# restore the text of the phrase by expanding each sNNN symbol
	$text = shift @fields;
	$text =~ s/s(\d+)/$symbol[$1]/g;
	# single-word phrases become the searchable word; multi-word
	# phrases get an empty word entry
	if ($text =~ / /) {
	    $word = "";
	} elsif ($text ne 'untranslated') {
	    $word = $text;
	}
	# NOTE(review): when $text is exactly 'untranslated', neither
	# branch assigns $word, so it retains its value from the previous
	# iteration — confirm whether that carry-over is intended.

	# output the phrase data
	print DATA "<Document>";
	print DATA "$key:$text:", join(":", @fields), ":\n";

	# output the word index search data
	print WORD "<Document>$word\n";

	# output the phrases to a text file
	if ($savephrases) {
	    # thesaurus-augmented lines carry two extra fields
	    if ((scalar @fields) == 7) {
		$isThesaurus = 1;
	    } else {
		$isThesaurus = 0;
	    }
	    print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
	}
    }
    close IN;
    close WORD;
    close DATA;
    close SAVE if ($savephrases);

}
1471
1472
1473
1474# sort routines used to renumber phrases
1475
# Numeric comparison routine for sort: orders values ascending.
sub numerically {
    return $a <=> $b;
}
1477
# Comparison routine for sort: orders document-occurrence entries by
# descending frequency.  Each entry is either "docnum" (which counts as
# a frequency of 1) or "docnum,freq".
sub by_doc_frequency {
    my @parts_a = split(/,/, $a);
    my @parts_b = split(/,/, $b);

    my $freq_a = (@parts_a > 1) ? $parts_a[1] : 1;
    my $freq_b = (@parts_b > 1) ? $parts_b[1] : 1;

    return ($freq_b <=> $freq_a);
}
1492
14931;
Note: See TracBrowser for help on using the repository browser.