Context Navigation

source: trunk/gsdl/perllib/classify/phind.pm@ 2503

Last change on this file since 2503 was 2500, checked in by sjboddie, 23 years ago
Removed test for phindcgi from phind classifier as it is no longer used
Property svn:keywords set to `Author Date Id Revision`
File size: 38.6 KB

Line
1	###########################################################################
2	#
3	# phind.pm -- the Phind classifier
4	#
5	# Copyright (C) 2000 Gordon W. Paynter
6	# Copyright (C) 2000 New Zealand Digital Library Project
7	#
8	#
9	# A component of the Greenstone digital library software
10	# from the New Zealand Digital Library Project at the
11	# University of Waikato, New Zealand.
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# The phind clasifier plugin.
30	# Options are dexcribed in the print_usage function.
31	# Type "classinfo.pl phind" at the command line for a summary.
32
33	package phind;
34
35	use BasClas;
36	use util;
37	use ghtml;
38	use unicode;
39
40	sub BEGIN {
41	@ISA = ('BasClas');
42	}
43
44
45	sub print_usage {
46	print STDERR "
47	usage: classify phind [options]
48
49	options:
50	-text Fields The text used to build the phrase hierarchy.
51	(default: 'section:Title,section:text')
52
53	-title Title The metadata field used to describe each document.
54	(default: 'Title')
55
56	-button Name The label for the classifier screen and button in
57	navigation bar.
58	(default: 'Phrase')
59
60	-language Regex Language or languages to use building hierarchy.
61	Languages are identified by two-letter country codes
62	like en (English), es (Spanish), and fr (French).
63	Language is a regular expression, so 'en\|fr' (English or
64	French) and '..' (match any language) are valid.
65	(default: 'en'.)
66
67	-savephrases File If set, the phrase infomation will be stored in
68	the given file as text. It is probably a good idea
69	to use an absolute path.
70	(defualt: not set)
71
72	-suffixmode N The smode parameter to the phrase extraction program. A
73	value of 0 means that stopwords are ignored, and of 1
74	means that stopwords are used.
75	(default: 1)
76
77	-thesaurus Name Name of a thesaurus stored in phind format in the
78	collection's etc directory.
79	(default: not set)
80
81	-untidy Don't remove working files.
82
83	"; }
84
85
86	%wanted_index_files = ('td'=>1,
87	't'=>1,
88	'ti'=>1,
89	'tl'=>1,
90	'tsd'=>1,
91	'idb'=>1,
92	'ib1'=>1,
93	'ib2'=>1,
94	'ib3'=>1,
95	'i'=>1,
96	'il'=>1,
97	'w'=>1,
98	'wa'=>1);
99
100
101
102	# Phrase delimiter symbols - these should be abstracted out someplace
103
104	my $colstart = "COLLECTIONSTART";
105	my $colend = "COLLECTIONEND";
106	my $doclimit = "DOCUMENTLIMIT";
107	my $senlimit = "SENTENCELIMIT";
108	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
109
110
111	# Create a new phind browser based on collect.cfg
112
113	sub new {
114	my $class = shift (@_);
115	my $self = new BasClas($class, @_);
116
117	my $out = $self->{'outhandle'};
118
119
120	# Phind installation check
121	# The phind phrase browser is research software and is not installed
122	# by defualt. If the user attepts to use it we warn them that it's a
123	# bit dodgy, then tell them how to install it. If they can do that
124	# and get all the files in place, then we let them proceed.
125
126	print $out "Checking Phind phrase browser requirements...\n";
127
128	# Ensure the Phind generate scripts are in place
129	my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
130	$file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
131	my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
132
133	if (!(-e $file1)) {
134	print STDERR "phind.pm: ERROR: The phind \"suffix\" program is not installed.\n\n";
135	exit(1);
136	}
137
138	# Ensure the Phind Java applet is in place
139	$src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
140	$file1 = &util::filename_cat($src, "Phind.class");
141
142	if (!(-e $file1)) {
143	print STDERR "The phind Java classes are not compiled. ";
144	print STDERR "To compile them, change to the directory\n";
145	print STDERR " $src\n";
146	print STDERR "and use your Java compiler to compile Phind.java.\n";
147	print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
148	exit(1);
149	}
150
151	# Parse classifier arguments
152	my $builddir = "";
153	if (!parsargv::parse(\@_,
154	q^text/.*/section:Title,section:text^, \$self->{'indexes'},
155	q^title/.*/Title^, \$self->{'titlefield'},
156	q^button/.*/Phrase^, \$self->{'buttonname'},
157	q^language/.*/en^, \$language,
158	q^builddir/.*/^, \$builddir,
159	q^savephrases/\d/0^, \$self->{'savephrases'},
160	q^suffixmode/\d/1^, \$self->{'suffixmode'},
161	q^thesaurus/.*/^, \$self->{'thesaurus'},
162	q^untidy^, \$self->{'untidy'},
163	"allow_extra_options")) {
164
165	print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
166	&print_usage();
167	die "\n";
168	}
169
170	# classifier information
171	$self->{'collection'} = $ENV{'GSDLCOLLECTION'};
172
173	# limit languages
174	$self->{'language_exp'} = $language;
175
176	# collection directories
177	$self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
178	if (!$builddir) {
179	$builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
180	}
181	$self->{'builddir'} = $builddir;
182
183	$self->{'total'} = 0;
184
185	return bless $self, $class;
186	}
187
188
189	# Initialise the phind classifier
190
191	sub init {
192	my $self = shift (@_);
193
194	# ensure we have a build directory
195	my $builddir = $self->{'builddir'};
196	die unless (-e "$builddir");
197
198	# create phind directory
199	my $phnumber = 1;
200	my $phinddir = &util::filename_cat($builddir, "phind1");
201	while (-e "$phinddir") {
202	$phnumber++;
203	$phinddir = &util::filename_cat($builddir, "phind$phnumber");
204	}
205	&util::mk_dir("$phinddir");
206	$self->{'phinddir'} = $phinddir;
207	$self->{'phindnumber'} = $phnumber;
208
209	# open filehandles for documents and text
210	my $clausefile = &util::filename_cat("$phinddir", "clauses");
211	&util::rm($clausefile) if (-e $clausefile);
212
213	my $txthandle = 'TEXT' . $phnumber;
214	open($txthandle, ">$clausefile") \|\| die "Cannot open $clausefile: $!";
215	$self->{'txthandle'} = $txthandle;
216
217	my $docfile = &util::filename_cat("$phinddir", "docs.txt");
218	&util::rm($docfile) if (-e $docfile);
219
220	my $dochandle = 'DOC' . $phnumber;
221	open($dochandle, ">$docfile") \|\| die "Cannot open $docfile: $!";
222	$self->{'dochandle'} = $dochandle;
223
224	}
225
226
227	# Classify each document.
228	#
229	# Each document is passed here in turn. The classifier extracts the
230	# text of each and stores it in the clauses file. Document details are
231	# stored in the docs.txt file.
232
233	sub classify {
234	my $self = shift (@_);
235	my $doc_obj = shift @_;
236
237	my $verbosity = $self->{'verbosity'};
238	my $top_section = $doc_obj->get_top_section();
239
240	my $titlefield = $self->{'titlefield'};
241
242	my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
243	print "process: $title\n" if ($verbosity > 2);
244
245	# Only consider the file if it is in the correct language
246	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
247	my $phrlanguage = $self->{'language_exp'};
248	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
249
250	# record this file
251	$self->{'total'} ++;
252	print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
253
254
255	# Store document details
256	my $OID = $doc_obj->get_OID();
257	$OID = "NULL" unless defined $OID;
258	my $dochandle = $self->{'dochandle'};
259	print $dochandle "<Document>\t$OID\t$title\n";
260
261	# Store the text occuring in this object
262
263	# output the document delimiter
264	my $txthandle = $self->{'txthandle'};
265	print $txthandle "$doclimit\n";
266
267	# iterarate over the required indexes and store their text
268	my $indexes = $self->{'indexes'};
269	my $text = "";
270	my ($part, $level, $field, $section, $data, $dataref);
271
272	foreach $part (split(/,/, $indexes)) {
273
274	# Each field has a level and a data element ((e.g. document:Title)
275	($level, $field) = split(/:/, $part);
276	die unless ($level && $field);
277
278	# Extract the text from every section
279	# (In phind, document:text and section:text are equivalent)
280	if ($field eq "text") {
281	$data = "";
282	$section = $doc_obj->get_top_section();
283	while (defined($section)) {
284	$data .= $doc_obj->get_text($section) . "\n";
285	$section = $doc_obj->get_next_section($section);
286	}
287	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
288	}
289
290	# Extract a metadata field from a document
291	# (If ther eis more than one element of the given type, get them all.)
292	elsif ($level eq "document") {
293	$dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
294	foreach $data (@$dataref) {
295	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
296	}
297	}
298
299	# Extract metadata from every section in a document
300	elsif ($level eq "section") {
301	$data = "";
302	$section = $doc_obj->get_top_section();
303	while (defined($section)) {
304	$dataref = $doc_obj->get_metadata($section, $field);
305	$data .= join("\n", @$dataref) . "\n";
306	$section = $doc_obj->get_next_section($section);
307	}
308	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
309	}
310
311	# Some sort of specification which I don't understand
312	else {
313	die "Unknown level ($level) in phind index ($part)\n";
314	}
315
316	}
317
318	# output the text
319	$text =~ tr/\n//s;
320	print $txthandle "$text";
321	}
322
323
324	# Construct the classifier from the information already gathered
325	#
326	# When get_classify_info is called, the clauses and docs.txt files have
327	# already been constructed in the phind directory. This function will
328	# translate them into compressed, indexed MGPP files that can be read by
329	# the phindcgi script. It will also register our classifier so that it
330	# shows up in the navigation bar.
331
332	sub get_classify_info {
333	my $self = shift (@_);
334
335	close $self->{'dochandle'};
336	close $self->{'txthandle'};
337	my $verbosity = $self->{'verbosity'};
338	my $out = $self->{'outhandle'};
339	my $phinddir = $self->{'phinddir'};
340
341	my $osextra = "";
342	if ($ENV{'GSDLOS'} !~ /^windows$/i) {
343	$osextra = " -d /";
344	}
345
346	if ($verbosity) {
347	print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
348	print $out "*** in", $self->{'phinddir'}, "\n";
349	}
350
351	# Construct phind indexes
352	my $suffixmode = $self->{'suffixmode'};
353	my ($command, $status);
354
355	# Generate the vocabulary, symbol statistics, and numbers file
356	# from the clauses file
357	print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
358	&extract_vocabulary($self);
359
360	# Use the suffix program to generate the phind/phrases file
361	print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
362	&execute("suffix $phinddir $suffixmode $verbosity", $verbosity, $out);
363
364	# Create the phrase file and put phrase numbers in phind/phrases
365	print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
366	&renumber_phrases($self);
367
368	print $out "\nCreating phrase databases\n";
369	my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
370	my $mg_stem = &util::filename_cat($phinddir, "pdata");
371
372	&execute("mgpp_passes $osextra -f $mg_stem -T1 $mg_input", $verbosity, $out);
373	&execute("mgpp_compression_dict $osextra -f $mg_stem", $verbosity, $out);
374	&execute("mgpp_passes $osextra -f $mg_stem -T2 $mg_input", $verbosity, $out);
375
376	# create the mg index of words
377	print $out "\nCreating word-level search indexes\n";
378	$mg_input = &util::filename_cat($phinddir, "pword.txt");
379	$mg_stem = &util::filename_cat($phinddir, "pword");
380
381	&execute("mgpp_passes $osextra -f $mg_stem -T1 -I1 $mg_input", $verbosity, $out);
382	&execute("mgpp_compression_dict $osextra -f $mg_stem", $verbosity, $out);
383	&execute("mgpp_perf_hash_build $osextra -f $mg_stem", $verbosity, $out);
384	&execute("mgpp_passes $osextra -f $mg_stem -T2 -I2 $mg_input", $verbosity, $out);
385	&execute("mgpp_weights_build $osextra -f $mg_stem", $verbosity, $out);
386	&execute("mgpp_invf_dict $osextra -f $mg_stem", $verbosity, $out);
387
388	&execute("mgpp_stem_idx $osextra -f $mg_stem -s 1", $verbosity, $out);
389	&execute("mgpp_stem_idx $osextra -f $mg_stem -s 2", $verbosity, $out);
390	&execute("mgpp_stem_idx $osextra -f $mg_stem -s 3", $verbosity, $out);
391
392	# create the mg document information database
393	print $out "\nCreating document information databases\n";
394	$mg_input = &util::filename_cat($phinddir, "docs.txt");
395	$mg_stem = &util::filename_cat($phinddir, "docs");
396
397	&execute("mgpp_passes $osextra -f $mg_stem -T1 $mg_input", $verbosity, $out);
398	&execute("mgpp_compression_dict $osextra -f $mg_stem", $verbosity, $out);
399	&execute("mgpp_passes $osextra -f $mg_stem -T2 $mg_input", $verbosity, $out);
400
401	# Tidy up stray files
402	if (!$self->{'untidy'}) {
403	print $out "\nCleaning up\n" if ($verbosity > 2);
404	opendir (DIR, $phinddir) \|\| die;
405	my @files = readdir DIR;
406	closedir DIR;
407
408	foreach $file (@files) {
409	next if $file =~ /^\.\.?$/;
410	my ($suffix) = $file =~ /\.([^\.]+)$/;
411	if (!defined $suffix \|\| !defined $wanted_index_files{$suffix}) {
412	# delete it!
413	print $out "deleting $file\n"; # if $verbosity > 2;
414	&util::rm (&util::filename_cat ($phinddir, $file));
415	}
416	}
417	}
418
419	# Return the information about the classifier that we'll later want to
420	# use to create macros when the Phind classifier document is displayed.
421	my %classifyinfo = ('thistype'=>'Invisible',
422	'childtype'=>'Phind',
423	'Title'=>$self->{'buttonname'},
424	'parameters'=>"phindnumber=$self->{'phindnumber'}",
425	'contains'=>[]);
426
427	my $collection = $self->{'collection'};
428	my $url = "library?a=p&p=phind&c=$collection";
429	push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
430
431	return \%classifyinfo;
432	}
433
434
435
436	sub convert_gml_to_tokens {
437
438	my ($language_exp, $text) = @_;
439
440	if ($language_exp =~ /en/) {
441	return &convert_gml_to_tokens_EN($text);
442	}
443
444	$_ = $text;
445
446	# 1. remove GML tags
447
448	# Remove everything that is in a tag
449	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
450	s/\s<br>\s/ LINEBREAK /isgo;
451	s/<[^>]*>/ /sgo;
452
453	# Now we have the text, but it may contain HTML
454	# elements coded as > etc. Remove these tags.
455	s/&/&/sgo;
456	s/</</sgo;
457	s/>/>/sgo;
458	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
459	s/\s<br>\s/ LINEBREAK /isgo;
460	s/<[^>]*>/ /sgo;
461
462	# replace<p> and <br> placeholders with clause break symbol (\n)
463	s/\s+/ /gso;
464	s/PARAGRAPHBREAK/\n/sgo;
465	s/LINEBREAK/\n/sgo;
466
467
468
469
470	# 2. Split the remaining text into space-delimited tokens
471
472	# Convert any HTML special characters (like ") to their UTF8 equivalent
473	s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
474
475	# Split text at word boundaries
476	s/\b/ /go;
477
478	# 3. Convert the remaining text to "clause format"
479
480	# Insert newline if the end of a sentence is detected
481	# (delimter is: "[\.\?\!]\s")
482	# s/\s*[\.\?\!]\s+/\n/go;
483
484	# remove unnecessary punctuation and replace with clause break symbol (\n)
485	s/[^\w ]/\n/go;
486
487	# remove extraneous whitespace
488	s/ +/ /sgo;
489	s/^\s+//mgo;
490	s/\s*$/\n/mgo;
491
492	# remove lines that contain one word or less
493	s/^\S*$//mgo;
494	s/^\s*$//mgo;
495	tr/\n//s;
496
497	return $_;
498	}
499
500	# A version of convert_gml_to_tokens that is fine-tuned to the English language.
501
502	sub convert_gml_to_tokens_EN {
503	$_ = shift @_;
504
505	# FIRST, remove GML tags
506
507	# Replace all whitespace with a simple space
508	s/\s+/ /gs;
509
510	# Remove everything that is in a tag
511	s/\s<p>\s/ PARAGRAPHBREAK /isg;
512	s/\s<br>\s/ LINEBREAK /isg;
513	s/<[^>]*>/ /sg;
514
515	# Now we have the text, but it may contain HTML
516	# elements coded as > etc. Remove these tags.
517	s/</</sg;
518	s/>/>/sg;
519
520	s/\s+/ /sg;
521	s/\s<p>\s/ PARAGRAPHBREAK /isg;
522	s/\s<br>\s/ LINEBREAK /isg;
523	s/<[^>]*>/ /sg;
524
525	# remove & and other miscellaneous markup tags
526	s/&/&/sg;
527	s/</</sg;
528	s/>/>/sg;
529	s/&/&/sg;
530
531	# replace<p> and <br> placeholders with carriage returns
532	s/PARAGRAPHBREAK/\n/sg;
533	s/LINEBREAK/\n/sg;
534
535
536	# Exceptional punctuation
537	#
538	# We make special cases of some punctuation
539
540	# remove any apostrophe that indicates omitted letters
541	s/(\w+)\'(\w*\s)/ $1$2 /g;
542
543	# remove period that appears in a person's initals
544	s/\s([A-Z])\./ $1 /g;
545
546	# replace hyphens in hypheanted words and names with a space
547	s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
548
549	# Convert the remaining text to "clause format",
550	# This means removing all excess punctuation and garbage text,
551	# normalising valid punctuation to fullstops and commas,
552	# then putting one cluse on each line.
553
554	# Insert newline when the end of a sentence is detected
555	# (delimter is: "[\.\?\!]\s")
556	s/\s*[\.\?\!]\s+/\n/g;
557
558	# split numbers after four digits
559	s/(\d\d\d\d)/$1 /g;
560
561	# split words after 32 characters
562
563	# squash repeated punctuation
564	tr/A-Za-z0-9 //cs;
565
566	# save email addresses
567	# s/\w+@\w+\.[\w\.]+/EMAIL/g;
568
569	# normalise clause breaks (mostly punctuation symbols) to commas
570	s/[^A-Za-z0-9 \n]+/ , /g;
571
572	# Remove repeated commas, and replace with newline
573	s/\s*,[, ]+/\n/g;
574
575	# remove extra whitespace
576	s/ +/ /sg;
577	s/^\s+//mg;
578	s/\s*$/\n/mg;
579
580	# remove lines that contain one word or less
581	s/^\w*$//mg;
582	s/^\s*$//mg;
583	tr/\n//s;
584
585	return $_;
586
587	}
588
589
590
591	# Execute a system command
592
593	sub execute {
594	my ($command, $verbosity, $outhandle) = @_;
595	print $outhandle "Executing: $command\n" if ($verbosity > 2);
596	$! = 0;
597	my $status = system($command);
598	if ($status != 0) {
599	print STDERR "phind - Error executing '$command': $!\n";
600	exit($status);
601	}
602	}
603
604
605	# Generate the vocabulary, symbol statistics, and numbers file from the
606	# clauses file. This is legacy code, so is a bit messy and probably wont
607	# run under windows.
608
609	sub extract_vocabulary {
610	my ($self) = @_;
611
612	my $verbosity = $self->{'verbosity'};
613	my $out = $self->{'outhandle'};
614
615	my $collectiondir = $self->{'collectiondir'};
616	my $phinddir = $self->{'phinddir'};
617
618	my $language_exp = $self->{'language_exp'};
619
620	my ($w, $l, $line, $word);
621
622	my ($first_delimiter, $last_delimiter,
623	$first_stopword, $last_stopword,
624	$first_extractword, $last_extractword,
625	$first_contentword, $last_contentword,
626	$phrasedelimiter);
627
628	my $thesaurus = $self->{'thesaurus'};
629	my ($thesaurus_links, $thesaurus_terms,
630	%thesaurus, $first_thesaurusword, $last_thesaurusword);
631
632	my %symbol;
633	my (%freq);
634
635	print $out "Calculating vocabulary\n" if ($verbosity > 1);
636
637	# Read and store the stopwords
638	my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "stopwords");
639	my $stopword_files = ();
640	my ($language, $language_dir, $file, $file_name);
641	my %stopwords;
642
643	# Examine each directory in the stopword directory
644	opendir(STOPDIR, $stopdir);
645	foreach $language (readdir STOPDIR) {
646
647	# Ignore entries that do not match the classifier's language
648	next unless ($language =~ /$language_exp/);
649	$language_dir = &util::filename_cat($stopdir, $language);
650	next unless (-d "$language_dir");
651
652	opendir(LANGDIR, $language_dir);
653	foreach $file (readdir LANGDIR) {
654
655	# Ignore entries that are not stopword files
656	next unless ($file =~ /sw$/);
657	$file_name = &util::filename_cat($language_dir, $file);
658	next unless (-f "$file_name");
659
660	# Read the stopwords
661	open(STOPFILE, "<$file_name");
662	while (<STOPFILE>) {
663	s/^\s+//;
664	s/\s.*//;
665	$word = $_;
666	$l = lc($word);
667	$stopwords{$l} = $word;
668	}
669	close STOPFILE;
670
671	}
672	closedir LANGDIR;
673	}
674	closedir STOPDIR;
675
676	# Read thesaurus information
677	if ($thesaurus) {
678
679	# link file exists
680	$thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
681	die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
682
683	# ensure term file exists in the correct language
684	if ($language_exp =~ /^([a-z][a-z])/) {
685	$language = $1;
686	} else {
687	$language = 'en';
688	}
689	$thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
690	die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
691
692
693	# Read the thesaurus terms
694	open(TH, "<$thesaurus_terms");
695	while(<TH>) {
696	s/^\d+ //;
697	s/\(.*\)//;
698	foreach $w (split(/\s+/, $_)) {
699	$thesaurus{lc($w)} = $w;
700	}
701	}
702	close TH;
703	}
704
705	# Read words in the text and count occurences
706	open(TXT, "<$phinddir/clauses");
707	my @words;
708
709	while(<TXT>) {
710	$line = $_;
711	next unless ($line =~ /./);
712
713	@words = split(/\s+/, $line);
714	foreach $w (@words) {
715	$l = lc($w);
716	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
717	$freq{$w}++;
718	}
719	$freq{$senlimit}++;
720	}
721
722	close TXT;
723
724	# Calculate the "best" form of each word
725	my (%bestform, %totalfreq, %bestfreq);
726
727	foreach $w (sort (keys %freq)) {
728	$l = lc($w);
729
730	# totalfreq is the number of times a term appears in any form
731	$totalfreq{$l} += $freq{$w};
732
733	if (defined $stopwords{$l}) {
734	$bestform{$l} = $stopwords{$l};
735
736	} elsif (defined $thesaurus{$l}) {
737	$bestform{$l} = $thesaurus{$l};
738
739	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
740	$bestfreq{$l} = $freq{$w};
741	$bestform{$l} = $w;
742	}
743	}
744
745	undef %freq;
746	undef %bestfreq;
747
748
749	# Assign symbol numbers to tokens
750	my $nextsymbol = 1;
751	my (@vocab);
752
753	# Delimiters
754	$first_delimiter = 1;
755
756	foreach $word (@delimiters) {
757
758	$word = lc($word);
759	$bestform{$word} = uc($word);
760	$vocab[$nextsymbol] = $word;
761	$symbol{$word} = $nextsymbol;
762	$nextsymbol++;
763	}
764	$last_delimiter = $nextsymbol - 1;
765
766	# Stopwords
767	$first_stopword = $nextsymbol;
768
769	foreach my $word (sort keys %stopwords) {
770
771	# don't incluse stopword unless it occurs in the text
772	$word = lc($word);
773	next unless ($totalfreq{$word});
774	next if ($symbol{$word});
775
776	$vocab[$nextsymbol] = $word;
777	$symbol{$word} = $nextsymbol;
778	$nextsymbol++;
779	}
780	$last_stopword = $nextsymbol - 1;
781	$first_contentword = $nextsymbol;
782
783	# Thesaurus terms
784	if ($thesaurus) {
785	$first_thesaurusword = $nextsymbol;
786
787	foreach my $word (sort keys %thesaurus) {
788
789	$word = lc($word);
790	next if ($symbol{$word});
791	$bestform{$word} = $thesaurus{$word};
792
793	$vocab[$nextsymbol] = $word;
794	$symbol{$word} = $nextsymbol;
795	$nextsymbol++;
796
797	}
798	$last_thesaurusword = $nextsymbol - 1;
799	}
800
801	# Other content words
802	$first_extractword = $nextsymbol;
803
804	foreach my $word (sort (keys %bestform)) {
805
806	next if ($symbol{$word});
807
808	$vocab[$nextsymbol] = $word;
809	$symbol{$word} = $nextsymbol;
810	$nextsymbol++;
811	}
812	$last_extractword = $nextsymbol - 1;
813	$last_contentword = $nextsymbol - 1;
814
815
816	# Outut the words
817	print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
818	open(VOC, ">$phinddir/clauses.vocab");
819
820	for (my $i = 1; $i < $nextsymbol; $i++) {
821	$w = $vocab[$i];
822
823	print VOC "$bestform{$w}\n";
824	$totalfreq{$w} = 0 unless ($totalfreq{$w});
825	}
826	close VOC;
827
828
829	# Create statistics file
830	# Output statistics about the vocablary
831	print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
832	&util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
833
834	open(STAT, ">$phinddir/clauses.stats")
835	\|\| die "Cannot open $phinddir/clauses.stats: $!";
836
837	print STAT "first_delimiter $first_delimiter\n";
838	print STAT "last_delimiter $last_delimiter\n";
839	print STAT "first_stopword $first_stopword\n";
840	print STAT "last_stopword $last_stopword\n";
841	if ($thesaurus) {
842	print STAT "first_thesaurusword $first_thesaurusword\n";
843	print STAT "last_thesaurusword $last_thesaurusword\n";
844	}
845	print STAT "first_extractword $first_extractword\n";
846	print STAT "last_extractword $last_extractword\n";
847	print STAT "first_contentword $first_contentword\n";
848	print STAT "last_contentword $last_contentword\n";
849	print STAT "first_symbol $first_delimiter\n";
850	print STAT "last_symbol $last_contentword\n";
851	print STAT "first_word $first_stopword\n";
852	print STAT "last_word $last_contentword\n";
853	close STAT;
854
855	undef @vocab;
856
857
858	# Create numbers file
859	# Save text as symbol numbers
860	print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
861
862	open(TXT, "<$phinddir/clauses");
863	open(NUM, ">$phinddir/clauses.numbers");
864
865	$phrasedelimiter = $symbol{lc($senlimit)};
866	print NUM "$symbol{lc($colstart)}\n";
867
868	# set up the special symbols that delimit documents and sentences
869	while(<TXT>) {
870
871	# split sentence into a list of tokens
872	$line = $_;
873	next unless ($line =~ /./);
874	@words = split(/\s+/, $line);
875
876	# output one token at a time
877	foreach $word (@words) {
878	$word = lc($word);
879	print NUM "$symbol{$word}\n";
880	}
881
882	# output phrase delimiter
883	print NUM "$phrasedelimiter\n";
884	}
885
886	close TXT;
887	print NUM "$symbol{lc($colend)}\n";
888	close NUM;
889
890	# Save thesaurus data in one convienient file
891	if ($thesaurus) {
892
893	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
894
895
896	print $out "Saving thesaurus as numbers in $thesaurusfile\n"
897	if ($verbosity > 1);
898
899	# Read the thesaurus terms
900	my ($num, $text, %thes_symbols);
901
902	open(TH, "<$thesaurus_terms");
903	while(<TH>) {
904	chomp;
905	@words = split(/\s+/, $_);
906	$num = shift @words;
907	$text = "";
908
909	# translate words into symbol numbers
910	foreach $word (@words) {
911	$word = lc($word);
912	if ($symbol{$word}) {
913	$text .= "s$symbol{$word} ";
914	} elsif ($verbosity) {
915	print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
916	}
917	}
918	$text =~ s/ $//;
919	$thes_symbols{$num} = $text;
920	}
921	close TH;
922
923	# Read the thesaurus links and write the corresponding data
924	open(TH, "<$thesaurus_links");
925	open(THOUT, ">$thesaurusfile");
926
927	while(<TH>) {
928	chomp;
929	($num, $text) = split(/:/, $_);
930
931	if (defined($thes_symbols{$num})) {
932	print THOUT "$num:$thes_symbols{$num}:$text\n";
933	} else {
934	print THOUT "$num:untranslated:$text\n";
935	}
936	}
937	close TH;
938	close THOUT;
939	}
940
941
942
943
944	}
945
946
947	# renumber_phrases
948	#
949	# Prepare the phrases file to be input to mgpp. The biggest problem is
950	# reconciling the phrase identifiers used by the suffix program (which
951	# we'll call suffix-id numbers) with the numbers used in the thesaurus
952	# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
953	# Phind-id numbers must be sorted by frequency of occurance.
954	#
955	# Start creating a set of phind-id numbers from the sorted suffix-id
956	# numbers and (if required) the thesaurus-id numbers. Then add any other
957	# phrases occuring in the thesaurus.
958	#
959	# The last thing we have to do is restore the vocabulary information to the
960	# phrase file so that the phrases are stored as words, not as symbol
961	# numbers.
962
963	# The original phrases file looks something like this:
964	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
965	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
966	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
967	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
968
969
970	sub renumber_phrases {
971	my ($self) = @_;
972
973	renumber_suffix_data($self);
974	renumber_thesaurus_data($self);
975	restore_vocabulary_data($self);
976
977	}
978
979
980
981	# renumber_suffix_data
982	#
983	# Translate phrases file to phrases.2 using phind keys instead
984	# of suffix keys and sorting the expansion data.
985
986	sub renumber_suffix_data {
987	my ($self) = @_;
988
989	my $verbosity = $self->{'verbosity'};
990	my $out = $self->{'outhandle'};
991	print $out "Translate phrases: suffix-ids become phind-id's\n"
992	if ($verbosity);
993
994	my $phinddir = $self->{'phinddir'};
995	my $infile = &util::filename_cat($phinddir, 'phrases');
996	my $outfile = &util::filename_cat($phinddir, 'phrases.2');
997
998	# Read the phrase file. Calculate initial set of phind-id
999	# numbers and store (suffixid -> frequency) relation.
1000
1001	my %suffixtophind;
1002	my @phindfrequency;
1003	my (@fields, $suffixid);
1004	my $nextphind = 1;
1005
1006	open(IN, "<$infile");
1007	while(<IN>) {
1008
1009	chomp;
1010	@fields = split(/:/, $_);
1011
1012	# get next suffixid and phindid
1013	$suffixid = shift @fields;
1014	$suffixtophind{$suffixid} = $nextphind;
1015
1016	# store total frequency
1017	shift @fields;
1018	$totalfrequency[$nextphind] = shift @fields;
1019
1020	$nextphind++;
1021	}
1022	close IN;
1023
1024
1025	# Translate phrases file to phrases.2. Use phind keys (not suffix
1026	# keys), sort expansion and document occurance data in order of
1027	# descending frequency..
1028	open(IN, "<$infile");
1029	open(OUT, ">$outfile");
1030
1031	my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1032	my (@documwents, @newexp, $k, $n);
1033	my $linenumber = 0;
1034
1035	while(<IN>) {
1036
1037	# read the line
1038	chomp;
1039	@fields = split(/:/, $_);
1040
1041	# get a phrase number for this line
1042	$suffixid = shift @fields;
1043	die unless (defined($suffixtophind{$suffixid}));
1044	$phindid = $suffixtophind{$suffixid};
1045
1046	# get the symbols in the phrase
1047	$text = shift @fields;
1048
1049	# output status information
1050	$linenumber++;
1051	if ($verbosity > 2) {
1052	if ($linenumber % 1000 == 0) {
1053	print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1054	}
1055	print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1056	}
1057
1058	# get the phrase frequency
1059	$tf = shift @fields;
1060
1061	# get the number of expansions
1062	$countexp = shift @fields;
1063
1064	# get the expansions, convert them into phind-id numbers, and sort them
1065	$expansions = shift @fields;
1066	@newexp = ();
1067	foreach $k (split(/,/, $expansions)) {
1068	die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1069	$n = $suffixtophind{$k};
1070	push @newexp, $n;
1071	}
1072	@newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1073
1074	# get the number of documents
1075	$countdocs = shift @fields;
1076
1077	# get the documents and sort them
1078	$documents = shift @fields;
1079	$documents =~ s/d//g;
1080	@documents = split(/;/, $documents);
1081	@documents = sort by_doc_frequency @documents;
1082
1083	# output the phrase data
1084	print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1085	print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1086
1087	}
1088
1089	close IN;
1090	close OUT;
1091	}
1092
1093
1094	# renumber_thesaurus_data
1095	#
1096	# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1097
1098	sub renumber_thesaurus_data {
1099	my ($self) = @_;
1100
1101	my $out = $self->{'outhandle'};
1102	my $verbosity = $self->{'verbosity'};
1103	my $thesaurus = $self->{'thesaurus'};
1104
1105	my $phinddir = $self->{'phinddir'};
1106	my $infile = &util::filename_cat($phinddir, "phrases.2");
1107	my $outfile = &util::filename_cat($phinddir, "phrases.3");
1108
1109
1110	# If no thesaurus is defined, simply move the phrases file.
1111	if (!$thesaurus) {
1112	print $out "Translate phrases.2: no thesaurus data\n"
1113	if ($verbosity);
1114	&util::mv($infile, $outfile);
1115	return;
1116	}
1117
1118	print $out "Translate phrases.2: add thesaurus data\n"
1119	if ($verbosity);
1120
1121	# 1.
1122	# Read thesaurus file and store (symbols->thesaurusid) mapping
1123	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1124	my %symbolstothesid;
1125	my (@fields, $thesid, $symbols);
1126
1127	open(TH, "<$thesaurusfile");
1128
1129	while (<TH>) {
1130
1131	chomp;
1132	@fields = split(/:/, $_);
1133
1134	# get id and text
1135	$thesid = shift @fields;
1136	$symbols = shift @fields;
1137	$symbolstothesid{$symbols} = $thesid;
1138	}
1139	close TH;
1140
1141	# 2.
1142	# Read phrases file to find thesaurus entries that already
1143	# have a phindid. Store their phind-ids for later translation,
1144	# and store their frequency for later sorting.
1145	my %thesaurustophindid;
1146	my %phindidtofrequency;
1147	my ($phindid, $freq);
1148
1149	open(IN, "<$infile");
1150
1151	while(<IN>) {
1152
1153	chomp;
1154	@fields = split(/:/, $_);
1155
1156	# phindid and symbols for this line
1157	$phindid = shift @fields;
1158	$symbols = shift @fields;
1159	$freq = shift @fields;
1160
1161	# do we have a thesaurus id corresponding to this phrase?
1162	if (defined($symbolstothesid{$symbols})) {
1163	$thesid = $symbolstothesid{$symbols};
1164	$thesaurustophindid{$thesid} = $phindid;
1165	$phindidtofrequency{$phindid} = $freq;
1166	}
1167	}
1168	close IN;
1169
1170	undef %symbolstothesid;
1171
1172	# 3.
1173	# Create phind-id numbers for remaining thesaurus entries,
1174	# and note that their frequency is 0 for later sorting.
1175	my $nextphindid = $phindid + 1;
1176
1177	open(TH, "<$thesaurusfile");
1178	while(<TH>) {
1179
1180	chomp;
1181	@fields = split(/:/, $_);
1182
1183	# read thesaurus-id and ensure it has a corresponding phind-id
1184	$thesid = shift @fields;
1185	if (!defined($thesaurustophindid{$thesid})) {
1186	$thesaurustophindid{$thesid} = $nextphindid;
1187	$phindidtofrequency{$nextphindid} = 0;
1188	$nextphindid++;
1189	}
1190	}
1191	close TH;
1192
1193	# 4.
1194	# Translate thesaurus file, replacing thesaurus-id numbers with
1195	# phind-id numbers.
1196	my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1197	my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1198	my (@links, $linkid, %linkidtotype, $newrelation);
1199
1200	open(TH, "<$thesaurusfile");
1201	open(TO, ">$newthesaurusfile");
1202	while(<TH>) {
1203
1204	chomp;
1205	@fields = split(/:/, $_);
1206
1207	# phindid and symbols for this line
1208	($thesid, $symbols, $relations) = @fields;
1209
1210	die unless ($thesid && $symbols);
1211	die unless $thesaurustophindid{$thesid};
1212	$phindid = $thesaurustophindid{$thesid};
1213
1214	# convert each part of the relation string to use phind-id numbers
1215	# at the same time, we want to sort the list by frequency.
1216	undef %linkidtotype;
1217
1218	foreach $linktext (split(/;/, $relations)) {
1219	@linkdata = split(/,/, $linktext);
1220
1221	# remember the linktype (e.g. BT, NT)
1222	$linktype = shift @linkdata;
1223
1224	# store the type of each link
1225	foreach $thesid (@linkdata) {
1226	die unless (defined($thesaurustophindid{$thesid}));
1227	$linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1228	}
1229	}
1230
1231	# sort the list of links, first by frequency, then by type.
1232	@links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1233	or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1234	$linkcounter = (scalar @links);
1235
1236	# create a string describing the link information
1237	$linktype = $linkidtotype{$links[0]};
1238	$newrelation = $linktype;
1239	foreach $linkid (@links) {
1240	if ($linkidtotype{$linkid} ne $linktype) {
1241	$linktype = $linkidtotype{$linkid};
1242	$newrelation .= ";" . $linktype;
1243	}
1244	$newrelation .= "," . $linkid;
1245	}
1246	$newrelation .= ";";
1247
1248
1249	# output the new line
1250	print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1251	}
1252	close TH;
1253	close TO;
1254
1255	undef %thesaurustophindid;
1256	undef %linkidtotype;
1257	undef %phindidtofrequency;
1258
1259	# 5.
1260	# Read thesaurus data (in phind-id format) into memory
1261	my %thesaurusdata;
1262
1263	open(TH, "<$newthesaurusfile");
1264	while(<TH>) {
1265	chomp;
1266	($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1267	die unless ($phindid && $symbols);
1268	$thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1269	}
1270	close TH;
1271
1272	# 6.
1273	# Add thesaurus data to phrases file
1274	my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1275	my (@documwents, @newexp, $k, $n);
1276	my $linenumber = 0;
1277
1278	open(IN, "<$infile");
1279	open(OUT, ">$outfile");
1280
1281	# Update existing phrases
1282	while(<IN>) {
1283
1284	chomp;
1285	@fields = split(/:/, $_);
1286
1287	# get data for this line
1288	$phindid = shift @fields;
1289
1290	# output the phrase data, with thesaurus information
1291	print OUT "$phindid:", join(":", @fields);
1292
1293	# add thesaurus data
1294	if (defined($thesaurusdata{$phindid})) {
1295	@fields = split(/:/, $thesaurusdata{$phindid});
1296	shift @fields;
1297	$linkcounter = shift @fields;
1298	$relations = shift @fields;
1299
1300	print OUT ":$linkcounter:$relations";
1301	$thesaurusdata{$phindid} = "";
1302	}
1303	print OUT "\n";
1304	}
1305	close IN;
1306
1307	# Add phrases that aren't already in the file
1308	foreach $phindid (sort numerically keys %thesaurusdata) {
1309	next unless ($thesaurusdata{$phindid});
1310
1311	@fields = split(/:/, $thesaurusdata{$phindid});
1312	$symbols = shift @fields;
1313	$linkcounter = shift @fields;
1314	$relations = shift @fields;
1315
1316	print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1317	}
1318	close OUT;
1319
1320	}
1321
1322	# restore_vocabulary_data
1323	#
1324	# Read phrases.3 and restore vocabulary information. Then write
1325	# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1326	# (if requested) to the saved phrases file.
1327
1328	sub restore_vocabulary_data {
1329	my ($self) = @_;
1330
1331	my $out = $self->{'outhandle'};
1332	my $verbosity = $self->{'verbosity'};
1333	print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1334
1335	my $phinddir = $self->{'phinddir'};
1336	my $infile = &util::filename_cat($phinddir, 'phrases.3');
1337	my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1338	my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1339	my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1340
1341	my $savephrases = $self->{'savephrases'};
1342
1343	# 1.
1344	# Read the vocabulary file
1345	open(V, "<$vocabfile")
1346	\|\| die "Cannot open $vocabfile: $!";
1347	my @symbol;
1348	my $i = 1;
1349	while(<V>) {
1350	chomp;
1351	$symbol[$i++] = $_;
1352	}
1353	close V;
1354
1355	# 2.
1356	# Translate phrases.3 to MGPP input files
1357	my ($key, $text, $word, $isThesaurus);
1358	my @fields;
1359	my $linenumber = 0;
1360
1361	open(IN, "<$infile");
1362	open(DATA, ">$datafile");
1363	open(WORD, ">$wordfile");
1364
1365	# Save the phrases in a separate text file
1366	if ($savephrases) {
1367	print $out "Saving phrases in $savephrases\n" if ($verbosity);
1368	open(SAVE, ">$savephrases");
1369	}
1370
1371	while(<IN>) {
1372
1373	# read the line
1374	chomp;
1375	$line = $_;
1376	@fields = split(/:/, $line);
1377
1378	# get a phrase number for this line
1379	$key = shift @fields;
1380
1381	# restore the text of the phrase
1382	$text = shift @fields;
1383	$text =~ s/s(\d+)/$symbol[$1]/g;
1384	if ($text =~ / /) {
1385	$word = "";
1386	} elsif ($text ne 'untranslated') {
1387	$word = $text;
1388	}
1389
1390	# output the phrase data
1391	print DATA "<Document>";
1392	print DATA "$key:$text:", join(":", @fields), ":\n";
1393
1394	# output the word index search data
1395	print WORD "<Document>$word\n";
1396
1397	# output the phrases to a text file
1398	if ($savephrases) {
1399	if ((scalar @fields) == 7) {
1400	$isThesaurus = 1;
1401	} else {
1402	$isThesaurus = 0;
1403	}
1404	print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1405	}
1406	}
1407	close IN;
1408	close WORD;
1409	close DATA;
1410	close SAVE if ($savephrases);
1411
1412	}
1413
1414
1415
1416	# sort routines used to renumber phrases
1417
1418	sub numerically { $a <=> $b }
1419
1420	sub by_doc_frequency {
1421	my $fa = 1;
1422	if ($a =~ /,/) {
1423	$fa = $a;
1424	$fa =~ s/\d+,//;
1425	}
1426	my $fb = 1;
1427	if ($b =~ /,/) {
1428	$fb = $b;
1429	$fb =~ s/\d+,//;
1430	}
1431
1432	return ($fb <=> $fa);
1433	}
1434
1435	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: