Context Navigation

source: trunk/gsdl/perllib/classify/phind.pm@ 1883

Last change on this file since 1883 was 1883, checked in by paynter, 23 years ago
Supports new parameters of suffix program and new stopword file locations. Filtering based on language now works. print_usage updated.
Property svn:keywords set to `Author Date Id Revision`
File size: 37.6 KB

Line
1	###########################################################################
2	#
3	# phind.pm -- the Phind classifier
4	#
5	# Copyright (C) 2000 Gordon W. Paynter
6	# Copyright (C) 2000 New Zealand Digital Library Project
7	#
8	#
9	# A component of the Greenstone digital library software
10	# from the New Zealand Digital Library Project at the
11	# University of Waikato, New Zealand.
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# The phind clasifier plugin
30	#
31	# options are:
32	# -button Name The label for the classifiers button in the
33	# navigation bar (defaults to "Phrase").
34	# -title Title The metadata field used to describe each document
35	# (defaults to "Title").
36	# -text fields The text used to build the phrase hierarchy
37	# (defaults to "section:Title,section:text").
38	# -phinddir directory Location of phind index files
39	# -verbosity num Control amount of output
40	# -untidy Do not clean up intermediate files
41	# -suffixmode num Mode of suffix program (0 = all phrases, 1 = stopword)
42	# -savephrases filename If set, phrase infomation will be stored in filename
43	# as text. (By defualt, it is not set.)
44	# -thesaurus name Name of a thesaurus stred in phind format in etc dir.
45
46	# How a classifier works.
47	#
48	# For each classifier requested in the collect.cfg file, buildcol.pl creates
49	# a new classifier object (such as the one defined in theis file) and later
50	# passes each document object to the classifier in turn for classification.
51	#
52	# Four functions are used:
53	#
54	# 1. "new" is called before the documents are processed to set up the
55	# classifier.
56	#
57	# 2. "init" is called after buildcol.pl has created the indexes etc but
58	# before the documents are classified in order that the classifier might
59	# set any variables it requires, etc.
60	#
61	# 3. "classify" is called once for each document object. The classifier
62	# "classifies" each document and updates its local data accordingly.
63	#
64	# 4. "get_classify_info" is called after every document has been
65	# classified. It collates the information about the documents and
66	# stores a reference to the classifier so that Greenstone can later
67	# display it.
68
69
70	package phind;
71
72	use BasClas;
73	use util;
74
75	sub BEGIN {
76	@ISA = ('BasClas');
77	}
78
79	# Define delimiter symbols - this should be abstracted out someplace
80	my $colstart = "COLLECTIONSTART";
81	my $colend = "COLLECTIONEND";
82	my $doclimit = "DOCUMENTLIMIT";
83	my $senlimit = "SENTENCELIMIT";
84	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
85
86
87	sub print_usage {
88	print STDERR "
89	usage: classify phind [options]
90
91	options:
92	-text Fields The text used to build the phrase hierarchy.
93	(default: 'section:Title,section:text')
94
95	-title Title The metadata field used to describe each document.
96	(default: 'Title')
97
98	-button Name The label for the classifier screen and button in
99	navigation bar.
100	(default: 'Phrase')
101
102	-language Regex Language or languages to use building hierarchy.
103	Languages are identified by two-letter country codes
104	like en (English), es (Spanish), and fr (French).
105	Language is a regular expression, so 'en\|fr' (English or
106	French) and '..' (match any language) are valid.
107	(default: 'en'.)
108
109	-savephrases File If set, the phrase infomation will be stored in
110	the given file as text. It is probably a good idea
111	to use an absolute path.
112	(defualt: not set)
113
114	-suffixmode N The smode parameter to the phrase extraction program. A
115	value of 0 means that stopwords are ignored, and of 1
116	means that stopwords are used.
117	(default: 1)
118
119	-thesaurus Name Name of a thesaurus stored in phind format in the
120	collection's etc directory.
121	(default: not set)
122
123	-untidy Don't remove working files.
124
125	"; }
126
127	# Create a new phind browser based on collect.cfg
128
129	sub new {
130	my $class = shift (@_);
131	my $self = new BasClas($class, @_);
132
133	my $out = $self->{'outhandle'};
134
135
136	# Phind installation check
137	# The phind phrase browser is research software and is not installed
138	# by defualt. If the user attepts to use it we warn them that it's a
139	# bit dodgy, then tell them how to install it. If they can do that
140	# and get all the files in place, then we let them proceed.
141
142	print $out "Checking Phind phrase browser requirements...\n";
143
144	# Make sure we're not in windows
145	if ($ENV{'GSDLOS'} =~ /windows/i) {
146	print STDERR "Sorry - Phind currently only works under Unix";
147	exit(1);
148	}
149
150	# Ensure the Phind generate scripts are in place
151	my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
152	my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
153
154	if (!(-e $file1)) {
155	print STDERR "The phind \"suffix\" program is not installed. ";
156	print STDERR "To install it, change to the directory\n";
157	print STDERR " $src\n";
158	print STDERR "and type \"make install-phind\".\n\n";
159	exit(1);
160	}
161
162	# Ensure the Phind CGI script is in place
163	$file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
164	$src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");
165
166	if (!(-e $file1)) {
167	print STDERR "The phind CGI program is not installed. ";
168	print STDERR "To install it, change to the directory\n";
169	print STDERR " $src\n";
170	print STDERR "and type \"make install-phind\".\n\n";
171	exit(1);
172	}
173
174	# Ensure the Phind Java applet is in place
175	$src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
176	$file1 = &util::filename_cat($src, "Phind.class");
177
178	if (!(-e $file1)) {
179	print STDERR "The phind Java classes are not compiled. ";
180	print STDERR "To compile them, change to the directory\n";
181	print STDERR " $src\n";
182	print STDERR "and use your Java compiler to compile Phind.java.\n";
183	print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
184	exit(1);
185	}
186
187	# Parse classifier arguments
188	my $builddir = "";
189	my $phinddir = "";
190	if (!parsargv::parse(\@_,
191	q^text/.*/section:Title,section:text^, \$self->{'indexes'},
192	q^title/.*/Title^, \$self->{'titlefield'},
193	q^button/.*/Phrase^, \$self->{'buttonname'},
194	q^language/.*/en^, \$language,
195	q^builddir/.*/^, \$builddir,
196	q^savephrases/\d/0^, \$self->{'savephrases'},
197	q^suffixmode/\d/1^, \$self->{'suffixmode'},
198	q^thesaurus/.*/^, \$self->{'thesaurus'},
199	q^untidy^, \$self->{'untidy'},
200	"allow_extra_options")) {
201
202	print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
203	&print_usage();
204	die "\n";
205	}
206
207	# classifier information
208	$self->{'collection'} = $ENV{'GSDLCOLLECTION'};
209
210	# limit languages
211	$self->{'language_exp'} = $language;
212
213	# collection directories
214	$self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
215	if (!$builddir) {
216	$builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
217	}
218	$self->{'builddir'} = $builddir;
219	$self->{'phinddir'} = &util::filename_cat($builddir, "phind");
220
221	return bless $self, $class;
222	}
223
224
225	# Initialise the phind classifier
226
227	sub init {
228	my $self = shift (@_);
229
230	# ensure we have a build directory
231	my $builddir = $self->{'builddir'};
232	die unless (-e "$builddir");
233
234	# create phind directory
235	my $phinddir = $self->{'phinddir'};
236	if (-e "$phinddir") {
237	&util::rm_r("$phinddir");
238	}
239	&util::mk_dir("$phinddir");
240
241	# open filehandles for documents and text
242	my $clausefile = &util::filename_cat("$phinddir", "clauses");
243	&util::rm($clausefile) if (-e $clausefile);
244	open(TEXT, ">$clausefile") \|\| die "Cannot open $clausefile: $!";
245	$self->{'txthandle'} = TEXT;
246
247	my $docfile = &util::filename_cat("$phinddir", "docs.txt");
248	&util::rm($docfile) if (-e $docfile);
249	open(DOCS, ">$docfile") \|\| die "Cannot open $docfile: $!";
250	$self->{'dochandle'} = DOCS;
251
252	}
253
254	# Classify each document.
255	#
256	# Each document is passed here in turn. The classifier extracts the
257	# text of each and stores it in the clauses file. Document details are
258	# stored in the docs.txt file.
259
260	sub classify {
261	my $self = shift (@_);
262	my ($doc_obj) = @_;
263
264	my $verbosity = $self->{'verbosity'};
265	my $top_section = $doc_obj->get_top_section();
266
267	my $titlefield = $self->{'titlefield'};
268
269	my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
270	print "process: $title\n" if ($verbosity > 2);
271
272	# Only consider the file if it is in the correct language
273	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
274	my $phrlanguage = $self->{'language_exp'};
275	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
276
277	# record this file
278	my $total++;
279	print "file $total: $file\n" if ($self->{'$verbosity'});
280
281
282	# Store document details
283	my $OID = $doc_obj->get_OID();
284	$OID = "NULL" unless defined $OID;
285	my $dochandle = $self->{'dochandle'};
286	print $dochandle "<Document>\t$OID\t$title\n";
287
288	# Store the text occuring in this object
289
290	# output the document delimiter
291	my $txthandle = $self->{'txthandle'};
292	print $txthandle "$doclimit\n";
293
294	# iterarate over the required indexes and store their text
295	my $indexes = $self->{'indexes'};
296	my $text = "";
297	my ($part, $level, $field, $section, $data);
298
299	foreach $part (split(/,/, $indexes)) {
300
301	# Each field has a level and a data element ((e.g. document:Title)
302	($level, $field) = split(/:/, $part);
303	die unless ($level && $field);
304
305	# Extract the text from every section
306	# (In phind, document:text and section:text are equivalent)
307	if ($field eq "text") {
308	$data = "";
309	$section = $doc_obj->get_top_section();
310	while (defined($section)) {
311	$data .= $doc_obj->get_text($section) . "\n";
312	$section = $doc_obj->get_next_section($section);
313	}
314	$text .= convert_gml_to_tokens($data) . "\n";
315	}
316
317	# Extract a metadata field from a document
318	elsif ($level eq "document") {
319	$data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
320	$text .= convert_gml_to_tokens($data) . "\n";
321	}
322
323	# Extract metadata from every section in a document
324	elsif ($level eq "section") {
325	$data = "";
326	$section = $doc_obj->get_top_section();
327	while (defined($section)) {
328	$data .= $doc_obj->get_metadata_element($section, $field) . "\n";
329	$section = $doc_obj->get_next_section($section);
330	}
331	$text .= convert_gml_to_tokens($data) . "\n";
332	}
333
334	# Some sort of specification which I don't understand
335	else {
336	die "Unknown level ($level) in phind index ($part)\n";
337	}
338
339	}
340
341	# output the text
342	$text =~ tr/\n//s;
343	print $txthandle "$text";
344
345	}
346
347
348
349	# Construct the classifier from the information already gathered
350	#
351	# When get_classify_info is called, the clauses and docs.txt files have
352	# already been constructed in the phind directory. This function will
353	# translate them into compressed, indexed MGPP files that can be read by
354	# the phindcgi script. It will also register our classifier so that it
355	# shows up in the navigation bar.
356
357	sub get_classify_info {
358	my $self = shift (@_);
359
360	my $verbosity = $self->{'verbosity'};
361	my $out = $self->{'outhandle'};
362	my $phinddir = $self->{'phinddir'};
363
364	if ($verbosity) {
365	print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
366	}
367
368	# Construct phind indexes
369	my $suffixmode = $self->{'suffixmode'};
370	my ($command, $status);
371
372	# Generate the vocabulary, symbol statistics, and numbers file
373	# from the clauses file
374	print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
375	&extract_vocabulary($self);
376
377	# Use the suffix program to generate the phind/phrases file
378	print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
379	&execute("suffix $phinddir $suffixmode $verbosity", $verbosity, $out);
380
381	# Create the phrase file and put phrase numbers in phind/phrases
382	print $out "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
383	&renumber_phrases($self);
384
385	# Create the mg phrase database
386	my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
387	my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
388	my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
389
390	my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
391	my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
392	my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
393	my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
394
395	print $out "\nCreating phrase databases\n";
396	my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
397	my $mg_stem = "pdata";
398
399	&execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
400	&execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
401	&execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
402
403	# create the mg index of words
404	print $out "\nCreating word-level search indexes\n";
405	$mg_input = &util::filename_cat($phinddir, "pword.txt");
406	$mg_stem = "pword";
407
408	&execute("$mg_passes -d $phinddir -f $mg_stem -T1 -I1 $mg_input", $verbosity, $out);
409	&execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
410	&execute("$mg_perf_hash_build -d $phinddir -f $mg_stem", $verbosity, $out);
411	&execute("$mg_passes -d $phinddir -f $mg_stem -T2 -I2 $mg_input", $verbosity, $out);
412	&execute("$mg_weights_build -d $phinddir -f $mg_stem", $verbosity, $out);
413	&execute("$mg_invf_dict -d $phinddir -f $mg_stem", $verbosity, $out);
414
415	&execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 1", $verbosity, $out);
416	&execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 2", $verbosity, $out);
417	&execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 3", $verbosity, $out);
418
419	# create the mg document information database
420	print $out "\nCreating document information databases\n";
421	$mg_input = &util::filename_cat($phinddir, "docs.txt");
422	$mg_stem = "docs";
423
424	&execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
425	&execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
426	&execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
427
428
429	# Tidy up stray files
430	if (!$self->{'untidy'}) {
431	print $out "\nCleaning up\n" if ($verbosity > 2);
432	&util::rm("$phinddir/clauses", "$phinddir/clauses.numbers",
433	"$phinddir/clauses.vocab", "$phinddir/clauses.stats",
434	"$phinddir/phrases", "$phinddir/phrases.3", "$phinddir/docs.txt",
435	"$phinddir/pdata.txt", "$phinddir/pword.txt");
436	my $outfile = 1;
437	while (-e "$phinddir/outPhrase.$outfile") {
438	&util::rm("$phinddir/outPhrase.$outfile");
439	$outfile++;
440	}
441	}
442
443
444	# Insert the classifier into.... what?
445	my $collection = $self->{'collection'};
446	my $url = "library?a=p&p=phind&c=$collection";
447
448	my %classifyinfo = ('thistype'=>'Invisible',
449	'childtype'=>'Phind',
450	'Title'=>$self->{'buttonname'},
451	'contains'=>[]);
452
453	push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
454	return \%classifyinfo;
455	}
456
457
458
459	sub convert_gml_to_tokens {
460
461	$_ = shift @_;
462
463	# FIRST, remove GML tags
464
465	# Replace all whitespace with a simple space
466	s/\s+/ /gs;
467
468	# Remove everything that is in a tag
469	s/\s<p>\s/ PARAGRAPHBREAK /isg;
470	s/\s<br>\s/ LINEBREAK /isg;
471	s/<[^>]*>/ /sg;
472
473	# Now we have the text, but it may contain HTML
474	# elements coded as > etc. Remove these tags.
475	s/</</sg;
476	s/>/>/sg;
477
478	s/\s+/ /sg;
479	s/\s<p>\s/ PARAGRAPHBREAK /isg;
480	s/\s<br>\s/ LINEBREAK /isg;
481	s/<[^>]*>/ /sg;
482
483	# remove & and other miscellaneous markup tags
484	s/&/&/sg;
485	s/</</sg;
486	s/>/>/sg;
487	s/&/&/sg;
488
489	# replace<p> and <br> placeholders with carriage returns
490	s/PARAGRAPHBREAK/\n/sg;
491	s/LINEBREAK/\n/sg;
492
493
494	# Exceptional punctuation
495	#
496	# We make special cases of some punctuation
497
498	# remove any apostrophe that indicates omitted letters
499	s/(\w+)\'(\w*\s)/ $1$2 /g;
500
501	# remove period that appears in a person's initals
502	s/\s([A-Z])\./ $1 /g;
503
504	# replace hyphens in hypheanted words and names with a space
505	s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
506
507
508	# Convert the remaining text to "clause format",
509	# This means removing all excess punctuation and garbage text,
510	# normalising valid punctuation to fullstops and commas,
511	# then putting one cluse on each line.
512
513	# Insert newline when the end of a sentence is detected
514	# (delimter is: "[\.\?\!]\s")
515	s/\s*[\.\?\!]\s+/\n/g;
516
517	# split numbers after four digits
518	s/(\d\d\d\d)/$1 /g;
519
520	# split words after 32 characters
521
522	# squash repeated punctuation
523	tr/A-Za-z0-9 //cs;
524
525	# save email addresses
526	# s/\w+@\w+\.[\w\.]+/EMAIL/g;
527
528	# normalise clause breaks (mostly punctuation symbols) to commas
529	s/[^A-Za-z0-9 \n]+/ , /g;
530
531	# Remove repeated commas, and replace with newline
532	s/\s*,[, ]+/\n/g;
533
534	# remove extra whitespace
535	s/ +/ /sg;
536	s/^\s+//mg;
537	s/\s*$/\n/mg;
538
539	# remove lines that contain one word or less
540	s/^\w*$//mg;
541	s/^\s*$//mg;
542	tr/\n//s;
543
544	return $_;
545	}
546
547
548	# Execute a system command
549
550	sub execute {
551	my ($command, $verbosity, $outhandle) = @_;
552	print $outhandle "Executing: $command\n" if ($verbosity > 2);
553	my $status = system($command);
554	if ($status != 0) {
555	print STDERR "phind - Error executing $command: $!\n";
556	exit($status);
557	}
558	}
559
560
561	# Generate the vocabulary, symbol statistics, and numbers file from the
562	# clauses file. This is legacy code, so is a bit messy and probably wont
563	# run under windows.
564
565	sub extract_vocabulary {
566	my ($self) = @_;
567
568	my $verbosity = $self->{'verbosity'};
569	my $out = $self->{'outhandle'};
570
571	my $collectiondir = $self->{'collectiondir'};
572	my $phinddir = $self->{'phinddir'};
573
574	my $language_exp = $self->{'language_exp'};
575
576	my ($w, $l, $line, $word);
577
578	my ($first_delimiter, $last_delimiter,
579	$first_stopword, $last_stopword,
580	$first_extractword, $last_extractword,
581	$first_contentword, $last_contentword,
582	$phrasedelimiter);
583
584	my $thesaurus = $self->{'thesaurus'};
585	my ($thesaurus_links, $thesaurus_terms,
586	%thesaurus, $first_thesaurusword, $last_thesaurusword);
587
588	my %symbol;
589	my (%freq);
590
591	print $out "Calculating vocabulary\n" if ($verbosity > 1);
592
593	# Read and store the stopwords
594	my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "stopwords");
595	my $stopword_files = ();
596	my ($language, $language_dir, $file, $file_name);
597	my %stopwords;
598
599	# Examine each directory in the stopword directory
600	opendir(STOPDIR, $stopdir);
601	foreach $language (readdir STOPDIR) {
602
603	# Ignore entries that do not match the classifier's language
604	next unless ($language =~ /$language_exp/);
605	$language_dir = &util::filename_cat($stopdir, $language);
606	next unless (-d "$language_dir");
607
608	opendir(LANGDIR, $language_dir);
609	foreach $file (readdir LANGDIR) {
610
611	# Ignore entries that are not stopword files
612	next unless ($file =~ /sw$/);
613	$file_name = &util::filename_cat($language_dir, $file);
614	next unless (-f "$file_name");
615
616	# Read the stopwords
617	open(STOPFILE, "<$file_name");
618	while (<STOPFILE>) {
619	s/^\s+//;
620	s/\s.*//;
621	$word = $_;
622	$l = lc($word);
623	$stopwords{$l} = $word;
624	}
625	close STOPFILE;
626
627	}
628	}
629
630	# Read thesaurus information
631	if ($thesaurus) {
632
633	# Ensure both link and term files exist
634	$thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
635	die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
636	$thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.EN");
637	die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
638
639	# Read the thesaurus terms
640	open(TH, "<$thesaurus_terms");
641	while(<TH>) {
642	s/^\d+ //;
643	s/\(.*\)//;
644	foreach $w (split(/\s+/, $_)) {
645	$thesaurus{lc($w)} = $w;
646	}
647	}
648	close TH;
649	}
650
651	# Read words in the text and count occurences
652	open(TXT, "<$phinddir/clauses");
653	my @words;
654
655	while(<TXT>) {
656	$line = $_;
657	next unless ($line =~ /./);
658
659	@words = split(/\s+/, $line);
660	foreach $w (@words) {
661	$l = lc($w);
662	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
663	$freq{$w}++;
664	}
665	$freq{$senlimit}++;
666	}
667
668	# Calculate the "best" form of each word
669	my (%bestform, %totalfreq, %bestfreq);
670
671	foreach $w (sort (keys %freq)) {
672	$l = lc($w);
673
674	# totalfreq is the number of times a term appears in any form
675	$totalfreq{$l} += $freq{$w};
676
677	if (defined $stopwords{$l}) {
678	$bestform{$l} = $stopwords{$l};
679
680	} elsif (defined $thesaurus{$l}) {
681	$bestform{$l} = $thesaurus{$l};
682
683	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
684	$bestfreq{$l} = $freq{$w};
685	$bestform{$l} = $w;
686	}
687	}
688
689	undef %freq;
690	undef %bestfreq;
691
692
693	# Assign symbol numbers to tokens
694	my $nextsymbol = 1;
695	my (@vocab);
696
697	# Delimiters
698	$first_delimiter = 1;
699
700	foreach $word (@delimiters) {
701
702	$word = lc($word);
703	$bestform{$word} = uc($word);
704	$vocab[$nextsymbol] = $word;
705	$symbol{$word} = $nextsymbol;
706	$nextsymbol++;
707	}
708	$last_delimiter = $nextsymbol - 1;
709
710	# Stopwords
711	$first_stopword = $nextsymbol;
712
713	foreach my $word (sort keys %stopwords) {
714
715	# don't incluse stopword unless it occurs in the text
716	$word = lc($word);
717	next unless ($totalfreq{$word});
718	next if ($symbol{$word});
719
720	$vocab[$nextsymbol] = $word;
721	$symbol{$word} = $nextsymbol;
722	$nextsymbol++;
723	}
724	$last_stopword = $nextsymbol - 1;
725	$first_contentword = $nextsymbol;
726
727	# Thesaurus terms
728	if ($thesaurus) {
729	$first_thesaurusword = $nextsymbol;
730
731	foreach my $word (sort keys %thesaurus) {
732
733	$word = lc($word);
734	next if ($symbol{$word});
735	$bestform{$word} = $thesaurus{$word};
736
737	$vocab[$nextsymbol] = $word;
738	$symbol{$word} = $nextsymbol;
739	$nextsymbol++;
740
741	}
742	$last_thesaurusword = $nextsymbol - 1;
743	}
744
745	# Other content words
746	$first_extractword = $nextsymbol;
747
748	foreach my $word (sort (keys %bestform)) {
749
750	next if ($symbol{$word});
751
752	$vocab[$nextsymbol] = $word;
753	$symbol{$word} = $nextsymbol;
754	$nextsymbol++;
755	}
756	$last_extractword = $nextsymbol - 1;
757	$last_contentword = $nextsymbol - 1;
758
759
760	# Outut the words
761	print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
762	open(VOC, ">$phinddir/clauses.vocab");
763
764	for (my $i = 1; $i < $nextsymbol; $i++) {
765	$w = $vocab[$i];
766
767	print VOC "$bestform{$w}\n";
768	$totalfreq{$w} = 0 unless ($totalfreq{$w});
769	}
770	close VOC;
771
772
773	# Create statistics file
774	# Output statistics about the vocablary
775	print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
776	&util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
777
778	open(STAT, ">$phinddir/clauses.stats")
779	\|\| die "Cannot open $phinddir/clauses.stats: $!";
780
781	print STAT "first_delimiter $first_delimiter\n";
782	print STAT "last_delimiter $last_delimiter\n";
783	print STAT "first_stopword $first_stopword\n";
784	print STAT "last_stopword $last_stopword\n";
785	if ($thesaurus) {
786	print STAT "first_thesaurusword $first_thesaurusword\n";
787	print STAT "last_thesaurusword $last_thesaurusword\n";
788	}
789	print STAT "first_extractword $first_extractword\n";
790	print STAT "last_extractword $last_extractword\n";
791	print STAT "first_contentword $first_contentword\n";
792	print STAT "last_contentword $last_contentword\n";
793	print STAT "first_symbol $first_delimiter\n";
794	print STAT "last_symbol $last_contentword\n";
795	print STAT "first_word $first_stopword\n";
796	print STAT "last_word $last_contentword\n";
797	close STAT;
798
799	undef @vocab;
800
801
802	# Create numbers file
803	# Save text as symbol numbers
804	print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
805
806	open(TXT, "<$phinddir/clauses");
807	open(NUM, ">$phinddir/clauses.numbers");
808
809	$phrasedelimiter = $symbol{lc($senlimit)};
810	print NUM "$symbol{lc($colstart)}\n";
811
812	# set up the special symbols that delimit documents and sentences
813	while(<TXT>) {
814
815	# split sentence into a list of tokens
816	$line = $_;
817	next unless ($line =~ /./);
818	@words = split(/\s+/, $line);
819
820	# output one token at a time
821	foreach $word (@words) {
822	$word = lc($word);
823	print NUM "$symbol{$word}\n";
824	}
825
826	# output phrase delimiter
827	print NUM "$phrasedelimiter\n";
828	}
829
830	print NUM "$symbol{lc($colend)}\n";
831	close NUM;
832
833	# Save thesaurus data in one convienient file
834	if ($thesaurus) {
835
836	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
837
838
839	print $out "Saving thesaurus as numbers in $thesaurusfile\n"
840	if ($verbosity > 1);
841
842	# Read the thesaurus terms
843	my ($num, $text, %thes_symbols);
844
845	open(TH, "<$thesaurus_terms");
846	while(<TH>) {
847	chomp;
848	@words = split(/\s+/, $_);
849	$num = shift @words;
850	$text = "";
851
852	# translate words into symbol numbers
853	foreach $word (@words) {
854	$word = lc($word);
855	if ($symbol{$word}) {
856	$text .= "s$symbol{$word} ";
857	} elsif ($verbosity) {
858	print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
859	}
860	}
861	$text =~ s/ $//;
862	$thes_symbols{$num} = $text;
863	}
864	close TH;
865
866	# Read the thesaurus links and write the corresponding data
867	open(TH, "<$thesaurus_links");
868	open(THOUT, ">$thesaurusfile");
869
870	while(<TH>) {
871	chomp;
872	($num, $text) = split(/:/, $_);
873
874	if (defined($thes_symbols{$num})) {
875	print THOUT "$num:$thes_symbols{$num}:$text\n";
876	} else {
877	print THOUT "$num:untranslated:$text\n";
878	}
879	}
880	close TH;
881	close THOUT;
882	}
883
884
885
886
887	}
888
889
890	# renumber_phrases
891	#
892	# Prepare the phrases file to be input to mgpp. The biggest problem is
893	# reconciling the phrase identifiers used by the suffix program (which
894	# we'll call suffix-id numbers) with the numbers used in the thesaurus
895	# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
896	# Phind-id numbers must be sorted by frequency of occurance.
897	#
898	# Start creating a set of phind-id numbers from the sorted suffix-id
899	# numbers and (if required) the thesaurus-id numbers. Then add any other
900	# phrases occuring in the thesaurus.
901	#
902	# The last thing we have to do is restore the vocabulary information to the
903	# phrase file so that the phrases are stored as words, not as symbol
904	# numbers.
905
906	# The original phrases file looks something like this:
907	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
908	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
909	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
910	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
911
912
913	sub renumber_phrases {
914	my ($self) = @_;
915
916	renumber_suffix_data($self);
917	renumber_thesaurus_data($self);
918	restore_vocabulary_data($self);
919
920	}
921
922
923
924	# renumber_suffix_data
925	#
926	# Translate phrases file to phrases.2 using phind keys instead
927	# of suffix keys and sorting the expansion data.
928
929	sub renumber_suffix_data {
930	my ($self) = @_;
931
932	my $verbosity = $self->{'verbosity'};
933	my $out = $self->{'outhandle'};
934	print $out "Translate phrases: suffix-ids become phind-id's\n"
935	if ($verbosity);
936
937	my $phinddir = $self->{'phinddir'};
938	my $infile = &util::filename_cat($phinddir, 'phrases');
939	my $outfile = &util::filename_cat($phinddir, 'phrases.2');
940
941	# Read the phrase file. Calculate initial set of phind-id
942	# numbers and store (suffixid -> frequency) relation.
943
944	my %suffixtophind;
945	my @phindfrequency;
946	my (@fields, $suffixid);
947	my $nextphind = 1;
948
949	open(IN, "<$infile");
950	while(<IN>) {
951
952	chomp;
953	@fields = split(/:/, $_);
954
955	# get next suffixid and phindid
956	$suffixid = shift @fields;
957	$suffixtophind{$suffixid} = $nextphind;
958
959	# store total frequency
960	shift @fields;
961	$totalfrequency[$nextphind] = shift @fields;
962
963	$nextphind++;
964	}
965	close IN;
966
967
968	# Translate phrases file to phrases.2. Use phind keys (not suffix
969	# keys), sort expansion and document occurance data in order of
970	# descending frequency..
971	open(IN, "<$infile");
972	open(OUT, ">$outfile");
973
974	my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
975	my (@documwents, @newexp, $k, $n);
976	my $linenumber = 0;
977
978	while(<IN>) {
979
980	# read the line
981	chomp;
982	@fields = split(/:/, $_);
983
984	# get a phrase number for this line
985	$suffixid = shift @fields;
986	die unless (defined($suffixtophind{$suffixid}));
987	$phindid = $suffixtophind{$suffixid};
988
989	# get the symbols in the phrase
990	$text = shift @fields;
991
992	# output status information
993	$linenumber++;
994	if ($verbosity > 2) {
995	if ($linenumber % 1000 == 0) {
996	print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
997	}
998	print $out "$num: $key\t($text)\n" if ($verbosity > 3);
999	}
1000
1001	# get the phrase frequency
1002	$tf = shift @fields;
1003
1004	# get the number of expansions
1005	$countexp = shift @fields;
1006
1007	# get the expansions, convert them into phind-id numbers, and sort them
1008	$expansions = shift @fields;
1009	@newexp = ();
1010	foreach $k (split(/,/, $expansions)) {
1011	die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1012	$n = $suffixtophind{$k};
1013	push @newexp, $n;
1014	}
1015	@newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1016
1017	# get the number of documents
1018	$countdocs = shift @fields;
1019
1020	# get the documents and sort them
1021	$documents = shift @fields;
1022	$documents =~ s/d//g;
1023	@documents = split(/;/, $documents);
1024	@documents = sort by_doc_frequency @documents;
1025
1026	# output the phrase data
1027	print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1028	print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1029
1030	}
1031
1032	close IN;
1033	close OUT;
1034	}
1035
1036
1037	# renumber_thesaurus_data
1038	#
1039	# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1040
1041	sub renumber_thesaurus_data {
1042	my ($self) = @_;
1043
1044	my $out = $self->{'outhandle'};
1045	my $verbosity = $self->{'verbosity'};
1046	my $thesaurus = $self->{'thesaurus'};
1047
1048	my $phinddir = $self->{'phinddir'};
1049	my $infile = &util::filename_cat($phinddir, "phrases.2");
1050	my $outfile = &util::filename_cat($phinddir, "phrases.3");
1051
1052
1053	# If no thesaurus is defined, simply move the phrases file.
1054	if (!$thesaurus) {
1055	print $out "Translate phrases.2: no thesaurus data\n"
1056	if ($verbosity);
1057	&util::mv($infile, $outfile);
1058	return;
1059	}
1060
1061	print $out "Translate phrases.2: add thesaurus data\n"
1062	if ($verbosity);
1063
1064	# 1.
1065	# Read thesaurus file and store (symbols->thesaurusid) mapping
1066	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1067	my %symbolstothesid;
1068	my (@fields, $thesid, $symbols);
1069
1070	open(TH, "<$thesaurusfile");
1071
1072	while (<TH>) {
1073
1074	chomp;
1075	@fields = split(/:/, $_);
1076
1077	# get id and text
1078	$thesid = shift @fields;
1079	$symbols = shift @fields;
1080	$symbolstothesid{$symbols} = $thesid;
1081	}
1082	close TH;
1083
1084	# 2.
1085	# Read phrases file to find thesaurus entries that already
1086	# have a phindid. Store their phind-ids for later translation.
1087	my %thesaurustophindid;
1088	my ($phindid);
1089
1090	open(IN, "<$infile");
1091
1092	while(<IN>) {
1093
1094	chomp;
1095	@fields = split(/:/, $_);
1096
1097	# phindid and symbols for this line
1098	$phindid = shift @fields;
1099	$symbols = shift @fields;
1100
1101	# do we have a thesaurus id corresponding to this phrase?
1102	if (defined($symbolstothesid{$symbols})) {
1103	$thesid = $symbolstothesid{$symbols};
1104	$thesaurustophindid{$thesid} = $phindid;
1105	}
1106	}
1107	close IN;
1108
1109	undef %symbolstothesid;
1110
1111	# 3.
1112	# Create phind-id numbers for remaining thesaurus entries
1113	my $nextphindid = $phindid + 1;
1114
1115	open(TH, "<$thesaurusfile");
1116	while(<TH>) {
1117
1118	chomp;
1119	@fields = split(/:/, $_);
1120
1121	# read thesaurus-id and ensure it has a corresponding phind-id
1122	$thesid = shift @fields;
1123	if (!defined($thesaurustophindid{$thesid})) {
1124	$thesaurustophindid{$thesid} = $nextphindid;
1125	$nextphindid++;
1126	}
1127	}
1128	close TH;
1129
1130	# 4.
1131	# Translate thesaurus file, replacing thesaurus-id numbers with
1132	# phind-id numbers.
1133	my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1134	my ($relations, $linkcounter, $linktext, $linktype, @linkdata, $link);
1135
1136	open(TH, "<$thesaurusfile");
1137	open(TO, ">$newthesaurusfile");
1138	while(<TH>) {
1139
1140	chomp;
1141	@fields = split(/:/, $_);
1142
1143	# phindid and symbols for this line
1144	($thesid, $symbols, $relations) = @fields;
1145
1146	die unless ($thesid && $symbols);
1147	die unless $thesaurustophindid{$thesid};
1148	$phindid = $thesaurustophindid{$thesid};
1149
1150	# convert each part of the relation string to use phind-id numbers
1151	$newrelation = "";
1152	$linkcounter = 0;
1153	foreach $linktext (split(/;/, $relations)) {
1154	@linkdata = split(/,/, $linktext);
1155
1156	# remember the linktype (e.g. BT, NT)
1157	$linktype = shift @linkdata;
1158	$newrelation .= "$linktype,";
1159
1160	# convert the link target identfiers
1161	foreach $link (@linkdata) {
1162	die unless (defined($thesaurustophindid{$link}));
1163	$newrelation .= "$thesaurustophindid{$link},";
1164	$linkcounter++;
1165	}
1166	$newrelation =~ s/\,$//;
1167	$newrelation .= ";";
1168	}
1169	$newrelation .= ":";
1170
1171	print TO "$phindid:$symbols:$linkcounter:$newrelation\n";
1172	}
1173	close TH;
1174	close TO;
1175
1176	undef %thesaurustophindid;
1177
1178	# 5.
1179	# Read thesaurus data (in phind-id format) into memory
1180	my %thesaurusdata;
1181
1182	open(TH, "<$newthesaurusfile");
1183	while(<TH>) {
1184	chomp;
1185	($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1186	die unless ($phindid && $symbols);
1187	$thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1188	}
1189
1190	# 6.
1191	# Add thesaurus data to phrases file
1192	my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1193	my (@documwents, @newexp, $k, $n);
1194	my $linenumber = 0;
1195
1196	open(IN, "<$infile");
1197	open(OUT, ">$outfile");
1198
1199	# Update existing phrases
1200	while(<IN>) {
1201
1202	chomp;
1203	@fields = split(/:/, $_);
1204
1205	# get data for this line
1206	$phindid = shift @fields;
1207
1208	# output the phrase data, with thesaurus information
1209	print OUT "$phindid:", join(":", @fields);
1210
1211	# add thesaurus data
1212	if (defined($thesaurusdata{$phindid})) {
1213	@fields = split(/:/, $thesaurusdata{$phindid});
1214	shift @fields;
1215	$linkcounter = shift @fields;
1216	$relations = shift @fields;
1217
1218	print OUT ":$linkcounter:$relations";
1219	$thesaurusdata{$phindid} = "";
1220	}
1221	print OUT "\n";
1222	}
1223	close IN;
1224
1225	# Add phrases that aren't already in the file
1226	foreach $phindid (sort numerically keys %thesaurusdata) {
1227	next unless ($thesaurusdata{$phindid});
1228
1229	@fields = split(/:/, $thesaurusdata{$phindid});
1230	$symbols = shift @fields;
1231	$linkcounter = shift @fields;
1232	$relations = shift @fields;
1233
1234	print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1235	}
1236	close OUT;
1237
1238	}
1239
1240	# restore_vocabulary_data
1241	#
1242	# Read phrases.3 and restore vocabulary information. Then write
1243	# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1244	# (if requested) to the saved phrases file.
1245
1246	sub restore_vocabulary_data {
1247	my ($self) = @_;
1248
1249	my $out = $self->{'outhandle'};
1250	my $verbosity = $self->{'verbosity'};
1251	print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1252
1253	my $phinddir = $self->{'phinddir'};
1254	my $infile = &util::filename_cat($phinddir, 'phrases.3');
1255	my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1256	my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1257	my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1258
1259	my $savephrases = $self->{'savephrases'};
1260
1261	# 1.
1262	# Read the vocabulary file
1263	open(V, "<$vocabfile")
1264	\|\| die "Cannot open $vocabfile: $!";
1265	my @symbol;
1266	my $i = 1;
1267	while(<V>) {
1268	chomp;
1269	$symbol[$i++] = $_;
1270	}
1271
1272
1273	# 2.
1274	# Translate phrases.3 to MGPP input files
1275	my ($key, $text, $word);
1276	my @fields;
1277	my $linenumber = 0;
1278
1279	open(IN, "<$infile");
1280	open(DATA, ">$datafile");
1281	open(WORD, ">$wordfile");
1282
1283	# Save the phrases in a separate text file
1284	if ($savephrases) {
1285	print $out "Saving phrases in $savephrases\n" if ($verbosity);
1286	open(SAVE, ">$savephrases");
1287	}
1288
1289	while(<IN>) {
1290
1291	# read the line
1292	chomp;
1293	$line = $_;
1294	@fields = split(/:/, $line);
1295
1296	# get a phrase number for this line
1297	$key = shift @fields;
1298
1299	# restore the text of the phrase
1300	$text = shift @fields;
1301	$text =~ s/s(\d+)/$symbol[$1]/g;
1302	if ($text =~ / /) {
1303	$word = "";
1304	} elsif ($text ne 'untranslated') {
1305	$word = $text;
1306	}
1307
1308	# output the phrase data
1309	print DATA "<Document>";
1310	print DATA "$key:$text:", join(":", @fields), ":\n";
1311
1312	# output the word index search data
1313	print WORD "<Document>$word\n";
1314
1315	# output the phrases to a text file
1316	if ($savephrases) {
1317	print SAVE $fields[0], "\t", $fields[2], "\t", "$text\n";
1318	}
1319	}
1320	close IN;
1321	close WORD;
1322	close DATA;
1323	close SAVE if ($savephrases);
1324
1325	}
1326
1327
1328
1329	# sort routines used to renumber phrases
1330
1331	sub numerically { $a <=> $b }
1332
1333	sub by_doc_frequency {
1334	my $fa = 1;
1335	if ($a =~ /,/) {
1336	$fa = $a;
1337	$fa =~ s/\d+,//;
1338	}
1339	my $fb = 1;
1340	if ($b =~ /,/) {
1341	$fb = $b;
1342	$fb =~ s/\d+,//;
1343	}
1344
1345	return ($fb <=> $fa);
1346	}
1347
1348	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: