Context Navigation

source: trunk/gsdl/perllib/classify/phind.pm@ 1897

Last change on this file since 1897 was 1897, checked in by paynter, 23 years ago
Convert_gml_into_tokens function a little more language tolerant, and the thesaurus appriate to the classifier's language is used when multiple languages are available.
Property svn:keywords set to `Author Date Id Revision`
File size: 37.9 KB

Line
1	###########################################################################
2	#
3	# phind.pm -- the Phind classifier
4	#
5	# Copyright (C) 2000 Gordon W. Paynter
6	# Copyright (C) 2000 New Zealand Digital Library Project
7	#
8	#
9	# A component of the Greenstone digital library software
10	# from the New Zealand Digital Library Project at the
11	# University of Waikato, New Zealand.
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# The phind clasifier plugin.
30	# Options are dexcribed in the print_usage function.
31	# Type "classinfo.pl phind" at the command line for a summary.
32
33	package phind;
34
35	use BasClas;
36	use util;
37	use ghtml;
38	use unicode;
39
40	sub BEGIN {
41	@ISA = ('BasClas');
42	}
43
44
45	sub print_usage {
46	print STDERR "
47	usage: classify phind [options]
48
49	options:
50	-text Fields The text used to build the phrase hierarchy.
51	(default: 'section:Title,section:text')
52
53	-title Title The metadata field used to describe each document.
54	(default: 'Title')
55
56	-button Name The label for the classifier screen and button in
57	navigation bar.
58	(default: 'Phrase')
59
60	-language Regex Language or languages to use building hierarchy.
61	Languages are identified by two-letter country codes
62	like en (English), es (Spanish), and fr (French).
63	Language is a regular expression, so 'en\|fr' (English or
64	French) and '..' (match any language) are valid.
65	(default: 'en'.)
66
67	-savephrases File If set, the phrase infomation will be stored in
68	the given file as text. It is probably a good idea
69	to use an absolute path.
70	(defualt: not set)
71
72	-suffixmode N The smode parameter to the phrase extraction program. A
73	value of 0 means that stopwords are ignored, and of 1
74	means that stopwords are used.
75	(default: 1)
76
77	-thesaurus Name Name of a thesaurus stored in phind format in the
78	collection's etc directory.
79	(default: not set)
80
81	-untidy Don't remove working files.
82
83	"; }
84
85
86	# Phrase delimiter symbols - these should be abstracted out someplace
87
88	my $colstart = "COLLECTIONSTART";
89	my $colend = "COLLECTIONEND";
90	my $doclimit = "DOCUMENTLIMIT";
91	my $senlimit = "SENTENCELIMIT";
92	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
93
94
95	# Create a new phind browser based on collect.cfg
96
97	sub new {
98	my $class = shift (@_);
99	my $self = new BasClas($class, @_);
100
101	my $out = $self->{'outhandle'};
102
103
104	# Phind installation check
105	# The phind phrase browser is research software and is not installed
106	# by defualt. If the user attepts to use it we warn them that it's a
107	# bit dodgy, then tell them how to install it. If they can do that
108	# and get all the files in place, then we let them proceed.
109
110	print $out "Checking Phind phrase browser requirements...\n";
111
112	# Make sure we're not in windows
113	if ($ENV{'GSDLOS'} =~ /windows/i) {
114	print STDERR "Sorry - Phind currently only works under Unix";
115	exit(1);
116	}
117
118	# Ensure the Phind generate scripts are in place
119	my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
120	my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
121
122	if (!(-e $file1)) {
123	print STDERR "The phind \"suffix\" program is not installed. ";
124	print STDERR "To install it, change to the directory\n";
125	print STDERR " $src\n";
126	print STDERR "and type \"make install-phind\".\n\n";
127	exit(1);
128	}
129
130	# Ensure the Phind CGI script is in place
131	$file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
132	$src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");
133
134	if (!(-e $file1)) {
135	print STDERR "The phind CGI program is not installed. ";
136	print STDERR "To install it, change to the directory\n";
137	print STDERR " $src\n";
138	print STDERR "and type \"make install-phind\".\n\n";
139	exit(1);
140	}
141
142	# Ensure the Phind Java applet is in place
143	$src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
144	$file1 = &util::filename_cat($src, "Phind.class");
145
146	if (!(-e $file1)) {
147	print STDERR "The phind Java classes are not compiled. ";
148	print STDERR "To compile them, change to the directory\n";
149	print STDERR " $src\n";
150	print STDERR "and use your Java compiler to compile Phind.java.\n";
151	print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
152	exit(1);
153	}
154
155	# Parse classifier arguments
156	my $builddir = "";
157	my $phinddir = "";
158	if (!parsargv::parse(\@_,
159	q^text/.*/section:Title,section:text^, \$self->{'indexes'},
160	q^title/.*/Title^, \$self->{'titlefield'},
161	q^button/.*/Phrase^, \$self->{'buttonname'},
162	q^language/.*/en^, \$language,
163	q^builddir/.*/^, \$builddir,
164	q^savephrases/\d/0^, \$self->{'savephrases'},
165	q^suffixmode/\d/1^, \$self->{'suffixmode'},
166	q^thesaurus/.*/^, \$self->{'thesaurus'},
167	q^untidy^, \$self->{'untidy'},
168	"allow_extra_options")) {
169
170	print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
171	&print_usage();
172	die "\n";
173	}
174
175	# classifier information
176	$self->{'collection'} = $ENV{'GSDLCOLLECTION'};
177
178	# limit languages
179	$self->{'language_exp'} = $language;
180
181	# collection directories
182	$self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
183	if (!$builddir) {
184	$builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
185	}
186	$self->{'builddir'} = $builddir;
187	$self->{'phinddir'} = &util::filename_cat($builddir, "phind");
188
189	return bless $self, $class;
190	}
191
192
193	# Initialise the phind classifier
194
195	sub init {
196	my $self = shift (@_);
197
198	# ensure we have a build directory
199	my $builddir = $self->{'builddir'};
200	die unless (-e "$builddir");
201
202	# create phind directory
203	my $phinddir = $self->{'phinddir'};
204	if (-e "$phinddir") {
205	&util::rm_r("$phinddir");
206	}
207	&util::mk_dir("$phinddir");
208
209	# open filehandles for documents and text
210	my $clausefile = &util::filename_cat("$phinddir", "clauses");
211	&util::rm($clausefile) if (-e $clausefile);
212	open(TEXT, ">$clausefile") \|\| die "Cannot open $clausefile: $!";
213	$self->{'txthandle'} = TEXT;
214
215	my $docfile = &util::filename_cat("$phinddir", "docs.txt");
216	&util::rm($docfile) if (-e $docfile);
217	open(DOCS, ">$docfile") \|\| die "Cannot open $docfile: $!";
218	$self->{'dochandle'} = DOCS;
219
220	}
221
222
223	# Classify each document.
224	#
225	# Each document is passed here in turn. The classifier extracts the
226	# text of each and stores it in the clauses file. Document details are
227	# stored in the docs.txt file.
228
229	sub classify {
230	my $self = shift (@_);
231	my ($doc_obj) = @_;
232
233	my $verbosity = $self->{'verbosity'};
234	my $top_section = $doc_obj->get_top_section();
235
236	my $titlefield = $self->{'titlefield'};
237
238	my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
239	print "process: $title\n" if ($verbosity > 2);
240
241	# Only consider the file if it is in the correct language
242	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
243	my $phrlanguage = $self->{'language_exp'};
244	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
245
246	# record this file
247	my $total++;
248	print "file $total: $file\n" if ($self->{'$verbosity'});
249
250
251	# Store document details
252	my $OID = $doc_obj->get_OID();
253	$OID = "NULL" unless defined $OID;
254	my $dochandle = $self->{'dochandle'};
255	print $dochandle "<Document>\t$OID\t$title\n";
256
257	# Store the text occuring in this object
258
259	# output the document delimiter
260	my $txthandle = $self->{'txthandle'};
261	print $txthandle "$doclimit\n";
262
263	# iterarate over the required indexes and store their text
264	my $indexes = $self->{'indexes'};
265	my $text = "";
266	my ($part, $level, $field, $section, $data, $dataref);
267
268	foreach $part (split(/,/, $indexes)) {
269
270	# Each field has a level and a data element ((e.g. document:Title)
271	($level, $field) = split(/:/, $part);
272	die unless ($level && $field);
273
274	# Extract the text from every section
275	# (In phind, document:text and section:text are equivalent)
276	if ($field eq "text") {
277	$data = "";
278	$section = $doc_obj->get_top_section();
279	while (defined($section)) {
280	$data .= $doc_obj->get_text($section) . "\n";
281	$section = $doc_obj->get_next_section($section);
282	}
283	$text .= convert_gml_to_tokens($data) . "\n";
284	}
285
286	# Extract a metadata field from a document
287	# (If ther eis more than one element of the given type, get them all.)
288	elsif ($level eq "document") {
289	$dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
290	foreach $data ($$dataref) {
291	$text .= convert_gml_to_tokens($data) . "\n";
292	}
293	}
294
295	# Extract metadata from every section in a document
296	elsif ($level eq "section") {
297	$data = "";
298	$section = $doc_obj->get_top_section();
299	while (defined($section)) {
300	$dataref .= $doc_obj->get_metadata($section, $field);
301	$data .= join("\n", $$dataref) . "\n";
302	$section = $doc_obj->get_next_section($section);
303	}
304	$text .= convert_gml_to_tokens($data) . "\n";
305	}
306
307	# Some sort of specification which I don't understand
308	else {
309	die "Unknown level ($level) in phind index ($part)\n";
310	}
311
312	}
313
314	# output the text
315	$text =~ tr/\n//s;
316	print $txthandle "$text";
317	}
318
319
320	# Construct the classifier from the information already gathered
321	#
322	# When get_classify_info is called, the clauses and docs.txt files have
323	# already been constructed in the phind directory. This function will
324	# translate them into compressed, indexed MGPP files that can be read by
325	# the phindcgi script. It will also register our classifier so that it
326	# shows up in the navigation bar.
327
328	sub get_classify_info {
329	my $self = shift (@_);
330
331	my $verbosity = $self->{'verbosity'};
332	my $out = $self->{'outhandle'};
333	my $phinddir = $self->{'phinddir'};
334
335	if ($verbosity) {
336	print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
337	}
338
339	# Construct phind indexes
340	my $suffixmode = $self->{'suffixmode'};
341	my ($command, $status);
342
343	# Generate the vocabulary, symbol statistics, and numbers file
344	# from the clauses file
345	print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
346	&extract_vocabulary($self);
347
348	# Use the suffix program to generate the phind/phrases file
349	print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
350	&execute("suffix $phinddir $suffixmode $verbosity", $verbosity, $out);
351
352	# Create the phrase file and put phrase numbers in phind/phrases
353	print $out "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
354	&renumber_phrases($self);
355
356	# Create the mg phrase database
357	my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
358	my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
359	my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
360
361	my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
362	my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
363	my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
364	my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
365
366	print $out "\nCreating phrase databases\n";
367	my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
368	my $mg_stem = "pdata";
369
370	&execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
371	&execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
372	&execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
373
374	# create the mg index of words
375	print $out "\nCreating word-level search indexes\n";
376	$mg_input = &util::filename_cat($phinddir, "pword.txt");
377	$mg_stem = "pword";
378
379	&execute("$mg_passes -d $phinddir -f $mg_stem -T1 -I1 $mg_input", $verbosity, $out);
380	&execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
381	&execute("$mg_perf_hash_build -d $phinddir -f $mg_stem", $verbosity, $out);
382	&execute("$mg_passes -d $phinddir -f $mg_stem -T2 -I2 $mg_input", $verbosity, $out);
383	&execute("$mg_weights_build -d $phinddir -f $mg_stem", $verbosity, $out);
384	&execute("$mg_invf_dict -d $phinddir -f $mg_stem", $verbosity, $out);
385
386	&execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 1", $verbosity, $out);
387	&execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 2", $verbosity, $out);
388	&execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 3", $verbosity, $out);
389
390	# create the mg document information database
391	print $out "\nCreating document information databases\n";
392	$mg_input = &util::filename_cat($phinddir, "docs.txt");
393	$mg_stem = "docs";
394
395	&execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
396	&execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
397	&execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
398
399
400	# Tidy up stray files
401	if (!$self->{'untidy'}) {
402	print $out "\nCleaning up\n" if ($verbosity > 2);
403	&util::rm("$phinddir/clauses", "$phinddir/clauses.numbers",
404	"$phinddir/clauses.vocab", "$phinddir/clauses.stats",
405	"$phinddir/phrases", "$phinddir/phrases.3", "$phinddir/docs.txt",
406	"$phinddir/pdata.txt", "$phinddir/pword.txt");
407	my $outfile = 1;
408	while (-e "$phinddir/outPhrase.$outfile") {
409	&util::rm("$phinddir/outPhrase.$outfile");
410	$outfile++;
411	}
412	}
413
414
415	# Insert the classifier into.... what?
416	my $collection = $self->{'collection'};
417	my $url = "library?a=p&p=phind&c=$collection";
418
419	my %classifyinfo = ('thistype'=>'Invisible',
420	'childtype'=>'Phind',
421	'Title'=>$self->{'buttonname'},
422	'contains'=>[]);
423
424	push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
425	return \%classifyinfo;
426	}
427
428
429
430	sub convert_gml_to_tokens {
431
432	my ($text) = @_;
433	my $language_exp = $self->{'language_exp'};
434
435	if ($language_exp =~ /en/) {
436	return convert_gml_to_tokens_EN($text);
437	}
438
439	# FIRST, remove GML tags
440	$_ = $text;
441
442	# Replace all whitespace with a simple space
443	s/\s+/ /gso;
444
445	# Remove everything that is in a tag
446	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
447	s/\s<br>\s/ LINEBREAK /isgo;
448	s/<[^>]*>/ /sgo;
449
450	# Now we have the text, but it may contain HTML
451	# elements coded as > etc. Remove these tags.
452	s/</</sgo;
453	s/>/>/sgo;
454
455	s/\s+/ /sgo;
456	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
457	s/\s<br>\s/ LINEBREAK /isgo;
458	s/<[^>]*>/ /sgo;
459
460	# remove & and other miscellaneous markup tags
461	s/&/&/sgo;
462	s/</</sgo;
463	s/>/>/sgo;
464	s/&/&/sgo;
465
466	# replace<p> and <br> placeholders with carriage returns
467	s/PARAGRAPHBREAK/\n/sgo;
468	s/LINEBREAK/\n/sgo;
469
470
471	s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,0))/gse;
472
473
474	# Convert the remaining text to "clause format",
475
476	# This means removing all excess punctuation and garbage text,
477	# normalising valid punctuation to fullstops and commas,
478	# then putting one clause on each line.
479
480	# Insert newline when the end of a sentence is detected
481	# (delimter is: "[\.\?\!]\s")
482	s/\s*[\.\?\!]\s+/\n/go;
483
484	# split numbers after four digits
485	s/(\d\d\d\d)/$1 /go;
486
487	# remove extra whitespace
488	s/ +/ /sgo;
489	s/^\s+//mgo;
490	s/\s*$/\n/mgo;
491
492	# remove lines that contain one word or less
493	s/^\S*$//mgo;
494	s/^\s*$//mgo;
495	tr/\n//s;
496
497	return $_;
498	}
499
500	# A version of convert_gml_to_tokens that is fine-tuned to the English language.
501
502	sub convert_gml_to_tokens_EN {
503	$_ = shift @_;
504
505	# FIRST, remove GML tags
506
507	# Replace all whitespace with a simple space
508	s/\s+/ /gs;
509
510	# Remove everything that is in a tag
511	s/\s<p>\s/ PARAGRAPHBREAK /isg;
512	s/\s<br>\s/ LINEBREAK /isg;
513	s/<[^>]*>/ /sg;
514
515	# Now we have the text, but it may contain HTML
516	# elements coded as > etc. Remove these tags.
517	s/</</sg;
518	s/>/>/sg;
519
520	s/\s+/ /sg;
521	s/\s<p>\s/ PARAGRAPHBREAK /isg;
522	s/\s<br>\s/ LINEBREAK /isg;
523	s/<[^>]*>/ /sg;
524
525	# remove & and other miscellaneous markup tags
526	s/&/&/sg;
527	s/</</sg;
528	s/>/>/sg;
529	s/&/&/sg;
530
531	# replace<p> and <br> placeholders with carriage returns
532	s/PARAGRAPHBREAK/\n/sg;
533	s/LINEBREAK/\n/sg;
534
535
536	# Exceptional punctuation
537	#
538	# We make special cases of some punctuation
539
540	# remove any apostrophe that indicates omitted letters
541	s/(\w+)\'(\w*\s)/ $1$2 /g;
542
543	# remove period that appears in a person's initals
544	s/\s([A-Z])\./ $1 /g;
545
546	# replace hyphens in hypheanted words and names with a space
547	s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
548
549	# Convert the remaining text to "clause format",
550	# This means removing all excess punctuation and garbage text,
551	# normalising valid punctuation to fullstops and commas,
552	# then putting one cluse on each line.
553
554	# Insert newline when the end of a sentence is detected
555	# (delimter is: "[\.\?\!]\s")
556	s/\s*[\.\?\!]\s+/\n/g;
557
558	# split numbers after four digits
559	s/(\d\d\d\d)/$1 /g;
560
561	# split words after 32 characters
562
563	# squash repeated punctuation
564	tr/A-Za-z0-9 //cs;
565
566	# save email addresses
567	# s/\w+@\w+\.[\w\.]+/EMAIL/g;
568
569	# normalise clause breaks (mostly punctuation symbols) to commas
570	s/[^A-Za-z0-9 \n]+/ , /g;
571
572	# Remove repeated commas, and replace with newline
573	s/\s*,[, ]+/\n/g;
574
575	# remove extra whitespace
576	s/ +/ /sg;
577	s/^\s+//mg;
578	s/\s*$/\n/mg;
579
580	# remove lines that contain one word or less
581	s/^\w*$//mg;
582	s/^\s*$//mg;
583	tr/\n//s;
584
585	return $_;
586
587	}
588
589
590
591	# Execute a system command
592
593	sub execute {
594	my ($command, $verbosity, $outhandle) = @_;
595	print $outhandle "Executing: $command\n" if ($verbosity > 2);
596	my $status = system($command);
597	if ($status != 0) {
598	print STDERR "phind - Error executing $command: $!\n";
599	exit($status);
600	}
601	}
602
603
604	# Generate the vocabulary, symbol statistics, and numbers file from the
605	# clauses file. This is legacy code, so is a bit messy and probably wont
606	# run under windows.
607
608	sub extract_vocabulary {
609	my ($self) = @_;
610
611	my $verbosity = $self->{'verbosity'};
612	my $out = $self->{'outhandle'};
613
614	my $collectiondir = $self->{'collectiondir'};
615	my $phinddir = $self->{'phinddir'};
616
617	my $language_exp = $self->{'language_exp'};
618
619	my ($w, $l, $line, $word);
620
621	my ($first_delimiter, $last_delimiter,
622	$first_stopword, $last_stopword,
623	$first_extractword, $last_extractword,
624	$first_contentword, $last_contentword,
625	$phrasedelimiter);
626
627	my $thesaurus = $self->{'thesaurus'};
628	my ($thesaurus_links, $thesaurus_terms,
629	%thesaurus, $first_thesaurusword, $last_thesaurusword);
630
631	my %symbol;
632	my (%freq);
633
634	print $out "Calculating vocabulary\n" if ($verbosity > 1);
635
636	# Read and store the stopwords
637	my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "stopwords");
638	my $stopword_files = ();
639	my ($language, $language_dir, $file, $file_name);
640	my %stopwords;
641
642	# Examine each directory in the stopword directory
643	opendir(STOPDIR, $stopdir);
644	foreach $language (readdir STOPDIR) {
645
646	# Ignore entries that do not match the classifier's language
647	next unless ($language =~ /$language_exp/);
648	$language_dir = &util::filename_cat($stopdir, $language);
649	next unless (-d "$language_dir");
650
651	opendir(LANGDIR, $language_dir);
652	foreach $file (readdir LANGDIR) {
653
654	# Ignore entries that are not stopword files
655	next unless ($file =~ /sw$/);
656	$file_name = &util::filename_cat($language_dir, $file);
657	next unless (-f "$file_name");
658
659	# Read the stopwords
660	open(STOPFILE, "<$file_name");
661	while (<STOPFILE>) {
662	s/^\s+//;
663	s/\s.*//;
664	$word = $_;
665	$l = lc($word);
666	$stopwords{$l} = $word;
667	}
668	close STOPFILE;
669
670	}
671	}
672
673	# Read thesaurus information
674	if ($thesaurus) {
675
676	# link file exists
677	$thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
678	die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
679
680	# ensure term file exists in the correct language
681	if ($language_exp =~ /^([a-z][a-z])/) {
682	$language = $1;
683	} else {
684	$language = 'en';
685	}
686	$thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
687	die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
688
689
690	# Read the thesaurus terms
691	open(TH, "<$thesaurus_terms");
692	while(<TH>) {
693	s/^\d+ //;
694	s/\(.*\)//;
695	foreach $w (split(/\s+/, $_)) {
696	$thesaurus{lc($w)} = $w;
697	}
698	}
699	close TH;
700	}
701
702	# Read words in the text and count occurences
703	open(TXT, "<$phinddir/clauses");
704	my @words;
705
706	while(<TXT>) {
707	$line = $_;
708	next unless ($line =~ /./);
709
710	@words = split(/\s+/, $line);
711	foreach $w (@words) {
712	$l = lc($w);
713	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
714	$freq{$w}++;
715	}
716	$freq{$senlimit}++;
717	}
718
719	# Calculate the "best" form of each word
720	my (%bestform, %totalfreq, %bestfreq);
721
722	foreach $w (sort (keys %freq)) {
723	$l = lc($w);
724
725	# totalfreq is the number of times a term appears in any form
726	$totalfreq{$l} += $freq{$w};
727
728	if (defined $stopwords{$l}) {
729	$bestform{$l} = $stopwords{$l};
730
731	} elsif (defined $thesaurus{$l}) {
732	$bestform{$l} = $thesaurus{$l};
733
734	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
735	$bestfreq{$l} = $freq{$w};
736	$bestform{$l} = $w;
737	}
738	}
739
740	undef %freq;
741	undef %bestfreq;
742
743
744	# Assign symbol numbers to tokens
745	my $nextsymbol = 1;
746	my (@vocab);
747
748	# Delimiters
749	$first_delimiter = 1;
750
751	foreach $word (@delimiters) {
752
753	$word = lc($word);
754	$bestform{$word} = uc($word);
755	$vocab[$nextsymbol] = $word;
756	$symbol{$word} = $nextsymbol;
757	$nextsymbol++;
758	}
759	$last_delimiter = $nextsymbol - 1;
760
761	# Stopwords
762	$first_stopword = $nextsymbol;
763
764	foreach my $word (sort keys %stopwords) {
765
766	# don't incluse stopword unless it occurs in the text
767	$word = lc($word);
768	next unless ($totalfreq{$word});
769	next if ($symbol{$word});
770
771	$vocab[$nextsymbol] = $word;
772	$symbol{$word} = $nextsymbol;
773	$nextsymbol++;
774	}
775	$last_stopword = $nextsymbol - 1;
776	$first_contentword = $nextsymbol;
777
778	# Thesaurus terms
779	if ($thesaurus) {
780	$first_thesaurusword = $nextsymbol;
781
782	foreach my $word (sort keys %thesaurus) {
783
784	$word = lc($word);
785	next if ($symbol{$word});
786	$bestform{$word} = $thesaurus{$word};
787
788	$vocab[$nextsymbol] = $word;
789	$symbol{$word} = $nextsymbol;
790	$nextsymbol++;
791
792	}
793	$last_thesaurusword = $nextsymbol - 1;
794	}
795
796	# Other content words
797	$first_extractword = $nextsymbol;
798
799	foreach my $word (sort (keys %bestform)) {
800
801	next if ($symbol{$word});
802
803	$vocab[$nextsymbol] = $word;
804	$symbol{$word} = $nextsymbol;
805	$nextsymbol++;
806	}
807	$last_extractword = $nextsymbol - 1;
808	$last_contentword = $nextsymbol - 1;
809
810
811	# Outut the words
812	print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
813	open(VOC, ">$phinddir/clauses.vocab");
814
815	for (my $i = 1; $i < $nextsymbol; $i++) {
816	$w = $vocab[$i];
817
818	print VOC "$bestform{$w}\n";
819	$totalfreq{$w} = 0 unless ($totalfreq{$w});
820	}
821	close VOC;
822
823
824	# Create statistics file
825	# Output statistics about the vocablary
826	print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
827	&util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
828
829	open(STAT, ">$phinddir/clauses.stats")
830	\|\| die "Cannot open $phinddir/clauses.stats: $!";
831
832	print STAT "first_delimiter $first_delimiter\n";
833	print STAT "last_delimiter $last_delimiter\n";
834	print STAT "first_stopword $first_stopword\n";
835	print STAT "last_stopword $last_stopword\n";
836	if ($thesaurus) {
837	print STAT "first_thesaurusword $first_thesaurusword\n";
838	print STAT "last_thesaurusword $last_thesaurusword\n";
839	}
840	print STAT "first_extractword $first_extractword\n";
841	print STAT "last_extractword $last_extractword\n";
842	print STAT "first_contentword $first_contentword\n";
843	print STAT "last_contentword $last_contentword\n";
844	print STAT "first_symbol $first_delimiter\n";
845	print STAT "last_symbol $last_contentword\n";
846	print STAT "first_word $first_stopword\n";
847	print STAT "last_word $last_contentword\n";
848	close STAT;
849
850	undef @vocab;
851
852
853	# Create numbers file
854	# Save text as symbol numbers
855	print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
856
857	open(TXT, "<$phinddir/clauses");
858	open(NUM, ">$phinddir/clauses.numbers");
859
860	$phrasedelimiter = $symbol{lc($senlimit)};
861	print NUM "$symbol{lc($colstart)}\n";
862
863	# set up the special symbols that delimit documents and sentences
864	while(<TXT>) {
865
866	# split sentence into a list of tokens
867	$line = $_;
868	next unless ($line =~ /./);
869	@words = split(/\s+/, $line);
870
871	# output one token at a time
872	foreach $word (@words) {
873	$word = lc($word);
874	print NUM "$symbol{$word}\n";
875	}
876
877	# output phrase delimiter
878	print NUM "$phrasedelimiter\n";
879	}
880
881	print NUM "$symbol{lc($colend)}\n";
882	close NUM;
883
884	# Save thesaurus data in one convienient file
885	if ($thesaurus) {
886
887	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
888
889
890	print $out "Saving thesaurus as numbers in $thesaurusfile\n"
891	if ($verbosity > 1);
892
893	# Read the thesaurus terms
894	my ($num, $text, %thes_symbols);
895
896	open(TH, "<$thesaurus_terms");
897	while(<TH>) {
898	chomp;
899	@words = split(/\s+/, $_);
900	$num = shift @words;
901	$text = "";
902
903	# translate words into symbol numbers
904	foreach $word (@words) {
905	$word = lc($word);
906	if ($symbol{$word}) {
907	$text .= "s$symbol{$word} ";
908	} elsif ($verbosity) {
909	print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
910	}
911	}
912	$text =~ s/ $//;
913	$thes_symbols{$num} = $text;
914	}
915	close TH;
916
917	# Read the thesaurus links and write the corresponding data
918	open(TH, "<$thesaurus_links");
919	open(THOUT, ">$thesaurusfile");
920
921	while(<TH>) {
922	chomp;
923	($num, $text) = split(/:/, $_);
924
925	if (defined($thes_symbols{$num})) {
926	print THOUT "$num:$thes_symbols{$num}:$text\n";
927	} else {
928	print THOUT "$num:untranslated:$text\n";
929	}
930	}
931	close TH;
932	close THOUT;
933	}
934
935
936
937
938	}
939
940
941	# renumber_phrases
942	#
943	# Prepare the phrases file to be input to mgpp. The biggest problem is
944	# reconciling the phrase identifiers used by the suffix program (which
945	# we'll call suffix-id numbers) with the numbers used in the thesaurus
946	# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
947	# Phind-id numbers must be sorted by frequency of occurance.
948	#
949	# Start creating a set of phind-id numbers from the sorted suffix-id
950	# numbers and (if required) the thesaurus-id numbers. Then add any other
951	# phrases occuring in the thesaurus.
952	#
953	# The last thing we have to do is restore the vocabulary information to the
954	# phrase file so that the phrases are stored as words, not as symbol
955	# numbers.
956
957	# The original phrases file looks something like this:
958	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
959	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
960	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
961	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
962
963
964	sub renumber_phrases {
965	my ($self) = @_;
966
967	renumber_suffix_data($self);
968	renumber_thesaurus_data($self);
969	restore_vocabulary_data($self);
970
971	}
972
973
974
975	# renumber_suffix_data
976	#
977	# Translate phrases file to phrases.2 using phind keys instead
978	# of suffix keys and sorting the expansion data.
979
980	sub renumber_suffix_data {
981	my ($self) = @_;
982
983	my $verbosity = $self->{'verbosity'};
984	my $out = $self->{'outhandle'};
985	print $out "Translate phrases: suffix-ids become phind-id's\n"
986	if ($verbosity);
987
988	my $phinddir = $self->{'phinddir'};
989	my $infile = &util::filename_cat($phinddir, 'phrases');
990	my $outfile = &util::filename_cat($phinddir, 'phrases.2');
991
992	# Read the phrase file. Calculate initial set of phind-id
993	# numbers and store (suffixid -> frequency) relation.
994
995	my %suffixtophind;
996	my @phindfrequency;
997	my (@fields, $suffixid);
998	my $nextphind = 1;
999
1000	open(IN, "<$infile");
1001	while(<IN>) {
1002
1003	chomp;
1004	@fields = split(/:/, $_);
1005
1006	# get next suffixid and phindid
1007	$suffixid = shift @fields;
1008	$suffixtophind{$suffixid} = $nextphind;
1009
1010	# store total frequency
1011	shift @fields;
1012	$totalfrequency[$nextphind] = shift @fields;
1013
1014	$nextphind++;
1015	}
1016	close IN;
1017
1018
1019	# Translate phrases file to phrases.2. Use phind keys (not suffix
1020	# keys), sort expansion and document occurance data in order of
1021	# descending frequency..
1022	open(IN, "<$infile");
1023	open(OUT, ">$outfile");
1024
1025	my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1026	my (@documwents, @newexp, $k, $n);
1027	my $linenumber = 0;
1028
1029	while(<IN>) {
1030
1031	# read the line
1032	chomp;
1033	@fields = split(/:/, $_);
1034
1035	# get a phrase number for this line
1036	$suffixid = shift @fields;
1037	die unless (defined($suffixtophind{$suffixid}));
1038	$phindid = $suffixtophind{$suffixid};
1039
1040	# get the symbols in the phrase
1041	$text = shift @fields;
1042
1043	# output status information
1044	$linenumber++;
1045	if ($verbosity > 2) {
1046	if ($linenumber % 1000 == 0) {
1047	print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1048	}
1049	print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1050	}
1051
1052	# get the phrase frequency
1053	$tf = shift @fields;
1054
1055	# get the number of expansions
1056	$countexp = shift @fields;
1057
1058	# get the expansions, convert them into phind-id numbers, and sort them
1059	$expansions = shift @fields;
1060	@newexp = ();
1061	foreach $k (split(/,/, $expansions)) {
1062	die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1063	$n = $suffixtophind{$k};
1064	push @newexp, $n;
1065	}
1066	@newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1067
1068	# get the number of documents
1069	$countdocs = shift @fields;
1070
1071	# get the documents and sort them
1072	$documents = shift @fields;
1073	$documents =~ s/d//g;
1074	@documents = split(/;/, $documents);
1075	@documents = sort by_doc_frequency @documents;
1076
1077	# output the phrase data
1078	print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1079	print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1080
1081	}
1082
1083	close IN;
1084	close OUT;
1085	}
1086
1087
1088	# renumber_thesaurus_data
1089	#
1090	# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1091
1092	sub renumber_thesaurus_data {
1093	my ($self) = @_;
1094
1095	my $out = $self->{'outhandle'};
1096	my $verbosity = $self->{'verbosity'};
1097	my $thesaurus = $self->{'thesaurus'};
1098
1099	my $phinddir = $self->{'phinddir'};
1100	my $infile = &util::filename_cat($phinddir, "phrases.2");
1101	my $outfile = &util::filename_cat($phinddir, "phrases.3");
1102
1103
1104	# If no thesaurus is defined, simply move the phrases file.
1105	if (!$thesaurus) {
1106	print $out "Translate phrases.2: no thesaurus data\n"
1107	if ($verbosity);
1108	&util::mv($infile, $outfile);
1109	return;
1110	}
1111
1112	print $out "Translate phrases.2: add thesaurus data\n"
1113	if ($verbosity);
1114
1115	# 1.
1116	# Read thesaurus file and store (symbols->thesaurusid) mapping
1117	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1118	my %symbolstothesid;
1119	my (@fields, $thesid, $symbols);
1120
1121	open(TH, "<$thesaurusfile");
1122
1123	while (<TH>) {
1124
1125	chomp;
1126	@fields = split(/:/, $_);
1127
1128	# get id and text
1129	$thesid = shift @fields;
1130	$symbols = shift @fields;
1131	$symbolstothesid{$symbols} = $thesid;
1132	}
1133	close TH;
1134
1135	# 2.
1136	# Read phrases file to find thesaurus entries that already
1137	# have a phindid. Store their phind-ids for later translation.
1138	my %thesaurustophindid;
1139	my ($phindid);
1140
1141	open(IN, "<$infile");
1142
1143	while(<IN>) {
1144
1145	chomp;
1146	@fields = split(/:/, $_);
1147
1148	# phindid and symbols for this line
1149	$phindid = shift @fields;
1150	$symbols = shift @fields;
1151
1152	# do we have a thesaurus id corresponding to this phrase?
1153	if (defined($symbolstothesid{$symbols})) {
1154	$thesid = $symbolstothesid{$symbols};
1155	$thesaurustophindid{$thesid} = $phindid;
1156	}
1157	}
1158	close IN;
1159
1160	undef %symbolstothesid;
1161
1162	# 3.
1163	# Create phind-id numbers for remaining thesaurus entries
1164	my $nextphindid = $phindid + 1;
1165
1166	open(TH, "<$thesaurusfile");
1167	while(<TH>) {
1168
1169	chomp;
1170	@fields = split(/:/, $_);
1171
1172	# read thesaurus-id and ensure it has a corresponding phind-id
1173	$thesid = shift @fields;
1174	if (!defined($thesaurustophindid{$thesid})) {
1175	$thesaurustophindid{$thesid} = $nextphindid;
1176	$nextphindid++;
1177	}
1178	}
1179	close TH;
1180
1181	# 4.
1182	# Translate thesaurus file, replacing thesaurus-id numbers with
1183	# phind-id numbers.
1184	my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1185	my ($relations, $linkcounter, $linktext, $linktype, @linkdata, $link);
1186
1187	open(TH, "<$thesaurusfile");
1188	open(TO, ">$newthesaurusfile");
1189	while(<TH>) {
1190
1191	chomp;
1192	@fields = split(/:/, $_);
1193
1194	# phindid and symbols for this line
1195	($thesid, $symbols, $relations) = @fields;
1196
1197	die unless ($thesid && $symbols);
1198	die unless $thesaurustophindid{$thesid};
1199	$phindid = $thesaurustophindid{$thesid};
1200
1201	# convert each part of the relation string to use phind-id numbers
1202	$newrelation = "";
1203	$linkcounter = 0;
1204	foreach $linktext (split(/;/, $relations)) {
1205	@linkdata = split(/,/, $linktext);
1206
1207	# remember the linktype (e.g. BT, NT)
1208	$linktype = shift @linkdata;
1209	$newrelation .= "$linktype,";
1210
1211	# convert the link target identfiers
1212	foreach $link (@linkdata) {
1213	die unless (defined($thesaurustophindid{$link}));
1214	$newrelation .= "$thesaurustophindid{$link},";
1215	$linkcounter++;
1216	}
1217	$newrelation =~ s/\,$//;
1218	$newrelation .= ";";
1219	}
1220	$newrelation .= ":";
1221
1222	print TO "$phindid:$symbols:$linkcounter:$newrelation\n";
1223	}
1224	close TH;
1225	close TO;
1226
1227	undef %thesaurustophindid;
1228
1229	# 5.
1230	# Read thesaurus data (in phind-id format) into memory
1231	my %thesaurusdata;
1232
1233	open(TH, "<$newthesaurusfile");
1234	while(<TH>) {
1235	chomp;
1236	($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1237	die unless ($phindid && $symbols);
1238	$thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1239	}
1240
1241	# 6.
1242	# Add thesaurus data to phrases file
1243	my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1244	my (@documwents, @newexp, $k, $n);
1245	my $linenumber = 0;
1246
1247	open(IN, "<$infile");
1248	open(OUT, ">$outfile");
1249
1250	# Update existing phrases
1251	while(<IN>) {
1252
1253	chomp;
1254	@fields = split(/:/, $_);
1255
1256	# get data for this line
1257	$phindid = shift @fields;
1258
1259	# output the phrase data, with thesaurus information
1260	print OUT "$phindid:", join(":", @fields);
1261
1262	# add thesaurus data
1263	if (defined($thesaurusdata{$phindid})) {
1264	@fields = split(/:/, $thesaurusdata{$phindid});
1265	shift @fields;
1266	$linkcounter = shift @fields;
1267	$relations = shift @fields;
1268
1269	print OUT ":$linkcounter:$relations";
1270	$thesaurusdata{$phindid} = "";
1271	}
1272	print OUT "\n";
1273	}
1274	close IN;
1275
1276	# Add phrases that aren't already in the file
1277	foreach $phindid (sort numerically keys %thesaurusdata) {
1278	next unless ($thesaurusdata{$phindid});
1279
1280	@fields = split(/:/, $thesaurusdata{$phindid});
1281	$symbols = shift @fields;
1282	$linkcounter = shift @fields;
1283	$relations = shift @fields;
1284
1285	print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1286	}
1287	close OUT;
1288
1289	}
1290
1291	# restore_vocabulary_data
1292	#
1293	# Read phrases.3 and restore vocabulary information. Then write
1294	# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1295	# (if requested) to the saved phrases file.
1296
1297	sub restore_vocabulary_data {
1298	my ($self) = @_;
1299
1300	my $out = $self->{'outhandle'};
1301	my $verbosity = $self->{'verbosity'};
1302	print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1303
1304	my $phinddir = $self->{'phinddir'};
1305	my $infile = &util::filename_cat($phinddir, 'phrases.3');
1306	my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1307	my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1308	my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1309
1310	my $savephrases = $self->{'savephrases'};
1311
1312	# 1.
1313	# Read the vocabulary file
1314	open(V, "<$vocabfile")
1315	\|\| die "Cannot open $vocabfile: $!";
1316	my @symbol;
1317	my $i = 1;
1318	while(<V>) {
1319	chomp;
1320	$symbol[$i++] = $_;
1321	}
1322
1323
1324	# 2.
1325	# Translate phrases.3 to MGPP input files
1326	my ($key, $text, $word);
1327	my @fields;
1328	my $linenumber = 0;
1329
1330	open(IN, "<$infile");
1331	open(DATA, ">$datafile");
1332	open(WORD, ">$wordfile");
1333
1334	# Save the phrases in a separate text file
1335	if ($savephrases) {
1336	print $out "Saving phrases in $savephrases\n" if ($verbosity);
1337	open(SAVE, ">$savephrases");
1338	}
1339
1340	while(<IN>) {
1341
1342	# read the line
1343	chomp;
1344	$line = $_;
1345	@fields = split(/:/, $line);
1346
1347	# get a phrase number for this line
1348	$key = shift @fields;
1349
1350	# restore the text of the phrase
1351	$text = shift @fields;
1352	$text =~ s/s(\d+)/$symbol[$1]/g;
1353	if ($text =~ / /) {
1354	$word = "";
1355	} elsif ($text ne 'untranslated') {
1356	$word = $text;
1357	}
1358
1359	# output the phrase data
1360	print DATA "<Document>";
1361	print DATA "$key:$text:", join(":", @fields), ":\n";
1362
1363	# output the word index search data
1364	print WORD "<Document>$word\n";
1365
1366	# output the phrases to a text file
1367	if ($savephrases) {
1368	print SAVE $fields[0], "\t", $fields[2], "\t", "$text\n";
1369	}
1370	}
1371	close IN;
1372	close WORD;
1373	close DATA;
1374	close SAVE if ($savephrases);
1375
1376	}
1377
1378
1379
1380	# sort routines used to renumber phrases
1381
1382	sub numerically { $a <=> $b }
1383
1384	sub by_doc_frequency {
1385	my $fa = 1;
1386	if ($a =~ /,/) {
1387	$fa = $a;
1388	$fa =~ s/\d+,//;
1389	}
1390	my $fb = 1;
1391	if ($b =~ /,/) {
1392	$fb = $b;
1393	$fb =~ s/\d+,//;
1394	}
1395
1396	return ($fb <=> $fa);
1397	}
1398
1399	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: