Context Navigation

source: trunk/gsdl/perllib/classify/phind.pm@ 1890

Last change on this file since 1890 was 1890, checked in by paynter, 23 years ago
When multiple metadata fields have multiple values, get them all. Initial (poor) support for multiple languages (will have to replace). Some documentation removed.
Property svn:keywords set to `Author Date Id Revision`
File size: 36.2 KB

Line
1	###########################################################################
2	#
3	# phind.pm -- the Phind classifier
4	#
5	# Copyright (C) 2000 Gordon W. Paynter
6	# Copyright (C) 2000 New Zealand Digital Library Project
7	#
8	#
9	# A component of the Greenstone digital library software
10	# from the New Zealand Digital Library Project at the
11	# University of Waikato, New Zealand.
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# The phind clasifier plugin.
30	# Options are dexcribed in the print_usage function.
31	# Type "classinfo.pl phind" at the command line for a summary.
32
33	package phind;
34
35	use BasClas;
36	use util;
37
38
39	sub BEGIN {
40	@ISA = ('BasClas');
41	}
42
43
44	sub print_usage {
45	print STDERR "
46	usage: classify phind [options]
47
48	options:
49	-text Fields The text used to build the phrase hierarchy.
50	(default: 'section:Title,section:text')
51
52	-title Title The metadata field used to describe each document.
53	(default: 'Title')
54
55	-button Name The label for the classifier screen and button in
56	navigation bar.
57	(default: 'Phrase')
58
59	-language Regex Language or languages to use building hierarchy.
60	Languages are identified by two-letter country codes
61	like en (English), es (Spanish), and fr (French).
62	Language is a regular expression, so 'en\|fr' (English or
63	French) and '..' (match any language) are valid.
64	(default: 'en'.)
65
66	-savephrases File If set, the phrase infomation will be stored in
67	the given file as text. It is probably a good idea
68	to use an absolute path.
69	(defualt: not set)
70
71	-suffixmode N The smode parameter to the phrase extraction program. A
72	value of 0 means that stopwords are ignored, and of 1
73	means that stopwords are used.
74	(default: 1)
75
76	-thesaurus Name Name of a thesaurus stored in phind format in the
77	collection's etc directory.
78	(default: not set)
79
80	-untidy Don't remove working files.
81
82	"; }
83
84
85	# Phrase delimiter symbols - these should be abstracted out someplace
86
87	my $colstart = "COLLECTIONSTART";
88	my $colend = "COLLECTIONEND";
89	my $doclimit = "DOCUMENTLIMIT";
90	my $senlimit = "SENTENCELIMIT";
91	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
92
93
94	# Create a new phind browser based on collect.cfg
95
96	sub new {
97	my $class = shift (@_);
98	my $self = new BasClas($class, @_);
99
100	my $out = $self->{'outhandle'};
101
102
103	# Phind installation check
104	# The phind phrase browser is research software and is not installed
105	# by defualt. If the user attepts to use it we warn them that it's a
106	# bit dodgy, then tell them how to install it. If they can do that
107	# and get all the files in place, then we let them proceed.
108
109	print $out "Checking Phind phrase browser requirements...\n";
110
111	# Make sure we're not in windows
112	if ($ENV{'GSDLOS'} =~ /windows/i) {
113	print STDERR "Sorry - Phind currently only works under Unix";
114	exit(1);
115	}
116
117	# Ensure the Phind generate scripts are in place
118	my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
119	my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
120
121	if (!(-e $file1)) {
122	print STDERR "The phind \"suffix\" program is not installed. ";
123	print STDERR "To install it, change to the directory\n";
124	print STDERR " $src\n";
125	print STDERR "and type \"make install-phind\".\n\n";
126	exit(1);
127	}
128
129	# Ensure the Phind CGI script is in place
130	$file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
131	$src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");
132
133	if (!(-e $file1)) {
134	print STDERR "The phind CGI program is not installed. ";
135	print STDERR "To install it, change to the directory\n";
136	print STDERR " $src\n";
137	print STDERR "and type \"make install-phind\".\n\n";
138	exit(1);
139	}
140
141	# Ensure the Phind Java applet is in place
142	$src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
143	$file1 = &util::filename_cat($src, "Phind.class");
144
145	if (!(-e $file1)) {
146	print STDERR "The phind Java classes are not compiled. ";
147	print STDERR "To compile them, change to the directory\n";
148	print STDERR " $src\n";
149	print STDERR "and use your Java compiler to compile Phind.java.\n";
150	print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
151	exit(1);
152	}
153
154	# Parse classifier arguments
155	my $builddir = "";
156	my $phinddir = "";
157	if (!parsargv::parse(\@_,
158	q^text/.*/section:Title,section:text^, \$self->{'indexes'},
159	q^title/.*/Title^, \$self->{'titlefield'},
160	q^button/.*/Phrase^, \$self->{'buttonname'},
161	q^language/.*/en^, \$language,
162	q^builddir/.*/^, \$builddir,
163	q^savephrases/\d/0^, \$self->{'savephrases'},
164	q^suffixmode/\d/1^, \$self->{'suffixmode'},
165	q^thesaurus/.*/^, \$self->{'thesaurus'},
166	q^untidy^, \$self->{'untidy'},
167	"allow_extra_options")) {
168
169	print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
170	&print_usage();
171	die "\n";
172	}
173
174	# classifier information
175	$self->{'collection'} = $ENV{'GSDLCOLLECTION'};
176
177	# limit languages
178	$self->{'language_exp'} = $language;
179
180	# collection directories
181	$self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
182	if (!$builddir) {
183	$builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
184	}
185	$self->{'builddir'} = $builddir;
186	$self->{'phinddir'} = &util::filename_cat($builddir, "phind");
187
188	return bless $self, $class;
189	}
190
191
192	# Initialise the phind classifier
193
194	sub init {
195	my $self = shift (@_);
196
197	# ensure we have a build directory
198	my $builddir = $self->{'builddir'};
199	die unless (-e "$builddir");
200
201	# create phind directory
202	my $phinddir = $self->{'phinddir'};
203	if (-e "$phinddir") {
204	&util::rm_r("$phinddir");
205	}
206	&util::mk_dir("$phinddir");
207
208	# open filehandles for documents and text
209	my $clausefile = &util::filename_cat("$phinddir", "clauses");
210	&util::rm($clausefile) if (-e $clausefile);
211	open(TEXT, ">$clausefile") \|\| die "Cannot open $clausefile: $!";
212	$self->{'txthandle'} = TEXT;
213
214	my $docfile = &util::filename_cat("$phinddir", "docs.txt");
215	&util::rm($docfile) if (-e $docfile);
216	open(DOCS, ">$docfile") \|\| die "Cannot open $docfile: $!";
217	$self->{'dochandle'} = DOCS;
218
219	}
220
221
222	# Classify each document.
223	#
224	# Each document is passed here in turn. The classifier extracts the
225	# text of each and stores it in the clauses file. Document details are
226	# stored in the docs.txt file.
227
228	sub classify {
229	my $self = shift (@_);
230	my ($doc_obj) = @_;
231
232	my $verbosity = $self->{'verbosity'};
233	my $top_section = $doc_obj->get_top_section();
234
235	my $titlefield = $self->{'titlefield'};
236
237	my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
238	print "process: $title\n" if ($verbosity > 2);
239
240	# Only consider the file if it is in the correct language
241	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
242	my $phrlanguage = $self->{'language_exp'};
243	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
244
245	# record this file
246	my $total++;
247	print "file $total: $file\n" if ($self->{'$verbosity'});
248
249
250	# Store document details
251	my $OID = $doc_obj->get_OID();
252	$OID = "NULL" unless defined $OID;
253	my $dochandle = $self->{'dochandle'};
254	print $dochandle "<Document>\t$OID\t$title\n";
255
256	# Store the text occuring in this object
257
258	# output the document delimiter
259	my $txthandle = $self->{'txthandle'};
260	print $txthandle "$doclimit\n";
261
262	# iterarate over the required indexes and store their text
263	my $indexes = $self->{'indexes'};
264	my $text = "";
265	my ($part, $level, $field, $section, $data, $dataref);
266
267	foreach $part (split(/,/, $indexes)) {
268
269	# Each field has a level and a data element ((e.g. document:Title)
270	($level, $field) = split(/:/, $part);
271	die unless ($level && $field);
272
273	# Extract the text from every section
274	# (In phind, document:text and section:text are equivalent)
275	if ($field eq "text") {
276	$data = "";
277	$section = $doc_obj->get_top_section();
278	while (defined($section)) {
279	$data .= $doc_obj->get_text($section) . "\n";
280	$section = $doc_obj->get_next_section($section);
281	}
282	$text .= convert_gml_to_tokens($data) . "\n";
283	}
284
285	# Extract a metadata field from a document
286	# (If ther eis more than one element of the given type, get them all.)
287	elsif ($level eq "document") {
288	$dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
289	foreach $data ($$dataref) {
290	$text .= convert_gml_to_tokens($data) . "\n";
291	}
292	}
293
294	# Extract metadata from every section in a document
295	elsif ($level eq "section") {
296	$data = "";
297	$section = $doc_obj->get_top_section();
298	while (defined($section)) {
299	$dataref .= $doc_obj->get_metadata($section, $field);
300	$data .= join("\n", $$dataref) . "\n";
301	$section = $doc_obj->get_next_section($section);
302	}
303	$text .= convert_gml_to_tokens($data) . "\n";
304	}
305
306	# Some sort of specification which I don't understand
307	else {
308	die "Unknown level ($level) in phind index ($part)\n";
309	}
310
311	}
312
313	# output the text
314	$text =~ tr/\n//s;
315	print $txthandle "$text";
316	}
317
318
319	# Construct the classifier from the information already gathered
320	#
321	# When get_classify_info is called, the clauses and docs.txt files have
322	# already been constructed in the phind directory. This function will
323	# translate them into compressed, indexed MGPP files that can be read by
324	# the phindcgi script. It will also register our classifier so that it
325	# shows up in the navigation bar.
326
327	sub get_classify_info {
328	my $self = shift (@_);
329
330	my $verbosity = $self->{'verbosity'};
331	my $out = $self->{'outhandle'};
332	my $phinddir = $self->{'phinddir'};
333
334	if ($verbosity) {
335	print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
336	}
337
338	# Construct phind indexes
339	my $suffixmode = $self->{'suffixmode'};
340	my ($command, $status);
341
342	# Generate the vocabulary, symbol statistics, and numbers file
343	# from the clauses file
344	print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
345	&extract_vocabulary($self);
346
347	# Use the suffix program to generate the phind/phrases file
348	print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
349	&execute("suffix $phinddir $suffixmode $verbosity", $verbosity, $out);
350
351	# Create the phrase file and put phrase numbers in phind/phrases
352	print $out "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
353	&renumber_phrases($self);
354
355	# Create the mg phrase database
356	my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
357	my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
358	my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
359
360	my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
361	my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
362	my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
363	my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
364
365	print $out "\nCreating phrase databases\n";
366	my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
367	my $mg_stem = "pdata";
368
369	&execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
370	&execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
371	&execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
372
373	# create the mg index of words
374	print $out "\nCreating word-level search indexes\n";
375	$mg_input = &util::filename_cat($phinddir, "pword.txt");
376	$mg_stem = "pword";
377
378	&execute("$mg_passes -d $phinddir -f $mg_stem -T1 -I1 $mg_input", $verbosity, $out);
379	&execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
380	&execute("$mg_perf_hash_build -d $phinddir -f $mg_stem", $verbosity, $out);
381	&execute("$mg_passes -d $phinddir -f $mg_stem -T2 -I2 $mg_input", $verbosity, $out);
382	&execute("$mg_weights_build -d $phinddir -f $mg_stem", $verbosity, $out);
383	&execute("$mg_invf_dict -d $phinddir -f $mg_stem", $verbosity, $out);
384
385	&execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 1", $verbosity, $out);
386	&execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 2", $verbosity, $out);
387	&execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 3", $verbosity, $out);
388
389	# create the mg document information database
390	print $out "\nCreating document information databases\n";
391	$mg_input = &util::filename_cat($phinddir, "docs.txt");
392	$mg_stem = "docs";
393
394	&execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
395	&execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
396	&execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
397
398
399	# Tidy up stray files
400	if (!$self->{'untidy'}) {
401	print $out "\nCleaning up\n" if ($verbosity > 2);
402	&util::rm("$phinddir/clauses", "$phinddir/clauses.numbers",
403	"$phinddir/clauses.vocab", "$phinddir/clauses.stats",
404	"$phinddir/phrases", "$phinddir/phrases.3", "$phinddir/docs.txt",
405	"$phinddir/pdata.txt", "$phinddir/pword.txt");
406	my $outfile = 1;
407	while (-e "$phinddir/outPhrase.$outfile") {
408	&util::rm("$phinddir/outPhrase.$outfile");
409	$outfile++;
410	}
411	}
412
413
414	# Insert the classifier into.... what?
415	my $collection = $self->{'collection'};
416	my $url = "library?a=p&p=phind&c=$collection";
417
418	my %classifyinfo = ('thistype'=>'Invisible',
419	'childtype'=>'Phind',
420	'Title'=>$self->{'buttonname'},
421	'contains'=>[]);
422
423	push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
424	return \%classifyinfo;
425	}
426
427
428
429	sub convert_gml_to_tokens {
430
431	my ($text) = @_;
432	my $language_exp = $self->{'language_exp'};
433
434	# FIRST, remove GML tags
435	$_ = $text;
436
437	# Replace all whitespace with a simple space
438	s/\s+/ /gso;
439
440	# Remove everything that is in a tag
441	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
442	s/\s<br>\s/ LINEBREAK /isgo;
443	s/<[^>]*>/ /sgo;
444
445	# Now we have the text, but it may contain HTML
446	# elements coded as > etc. Remove these tags.
447	s/</</sgo;
448	s/>/>/sgo;
449
450	s/\s+/ /sgo;
451	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
452	s/\s<br>\s/ LINEBREAK /isgo;
453	s/<[^>]*>/ /sgo;
454
455	# remove & and other miscellaneous markup tags
456	s/&/&/sgo;
457	s/</</sgo;
458	s/>/>/sgo;
459	s/&/&/sgo;
460
461	# replace<p> and <br> placeholders with carriage returns
462	s/PARAGRAPHBREAK/\n/sgo;
463	s/LINEBREAK/\n/sgo;
464
465	$text = $_;
466
467
468	# Language-specific word-cleanup
469
470	# English
471	if ($language_exp =~ /en/) {
472
473	# remove any apostrophe that indicates omitted letters
474	$text =~ s/(\w+)\'(\w*\s)/ $1$2 /g;
475
476	# remove period that appears in a person's initals
477	$text =~ s/\s([A-Z])\./ $1 /g;
478
479	# replace hyphens in hyphenated words and names with a space
480	$text =~ s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
481
482	}
483
484
485	# Convert the remaining text to "clause format",
486
487	# This means removing all excess punctuation and garbage text,
488	# normalising valid punctuation to fullstops and commas,
489	# then putting one clause on each line.
490
491	# Insert newline when the end of a sentence is detected
492	# (delimter is: "[\.\?\!]\s")
493	$text =~ s/\s*[\.\?\!]\s+/\n/go;
494
495
496	# Language-specific clause clean-up
497
498	# English
499	if ($language_exp =~ /en/) {
500
501	# split numbers after four digits
502	$text =~ s/(\d\d\d\d)/$1 /g;
503
504	# split words after 32 characters
505
506	# squash repeated punctuation
507	$text =~ tr/A-Za-z0-9 //cs;
508
509	# normalise clause breaks (mostly punctuation symbols) to commas
510	$text =~ s/[^A-Za-z0-9 \n]+/ , /g;
511
512	# Remove repeated commas, and replace with newline
513	$text =~ s/\s*,[, ]+/\n/g;
514	}
515
516	# remove extra whitespace
517	$_ = $text;
518	s/ +/ /sgo;
519	s/^\s+//mgo;
520	s/\s*$/\n/mgo;
521
522	# remove lines that contain one word or less
523	s/^\S*$//mgo;
524	s/^\s*$//mgo;
525	tr/\n//s;
526
527	return $_;
528	}
529
530
531	# Execute a system command
532
533	sub execute {
534	my ($command, $verbosity, $outhandle) = @_;
535	print $outhandle "Executing: $command\n" if ($verbosity > 2);
536	my $status = system($command);
537	if ($status != 0) {
538	print STDERR "phind - Error executing $command: $!\n";
539	exit($status);
540	}
541	}
542
543
544	# Generate the vocabulary, symbol statistics, and numbers file from the
545	# clauses file. This is legacy code, so is a bit messy and probably wont
546	# run under windows.
547
548	sub extract_vocabulary {
549	my ($self) = @_;
550
551	my $verbosity = $self->{'verbosity'};
552	my $out = $self->{'outhandle'};
553
554	my $collectiondir = $self->{'collectiondir'};
555	my $phinddir = $self->{'phinddir'};
556
557	my $language_exp = $self->{'language_exp'};
558
559	my ($w, $l, $line, $word);
560
561	my ($first_delimiter, $last_delimiter,
562	$first_stopword, $last_stopword,
563	$first_extractword, $last_extractword,
564	$first_contentword, $last_contentword,
565	$phrasedelimiter);
566
567	my $thesaurus = $self->{'thesaurus'};
568	my ($thesaurus_links, $thesaurus_terms,
569	%thesaurus, $first_thesaurusword, $last_thesaurusword);
570
571	my %symbol;
572	my (%freq);
573
574	print $out "Calculating vocabulary\n" if ($verbosity > 1);
575
576	# Read and store the stopwords
577	my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "stopwords");
578	my $stopword_files = ();
579	my ($language, $language_dir, $file, $file_name);
580	my %stopwords;
581
582	# Examine each directory in the stopword directory
583	opendir(STOPDIR, $stopdir);
584	foreach $language (readdir STOPDIR) {
585
586	# Ignore entries that do not match the classifier's language
587	next unless ($language =~ /$language_exp/);
588	$language_dir = &util::filename_cat($stopdir, $language);
589	next unless (-d "$language_dir");
590
591	opendir(LANGDIR, $language_dir);
592	foreach $file (readdir LANGDIR) {
593
594	# Ignore entries that are not stopword files
595	next unless ($file =~ /sw$/);
596	$file_name = &util::filename_cat($language_dir, $file);
597	next unless (-f "$file_name");
598
599	# Read the stopwords
600	open(STOPFILE, "<$file_name");
601	while (<STOPFILE>) {
602	s/^\s+//;
603	s/\s.*//;
604	$word = $_;
605	$l = lc($word);
606	$stopwords{$l} = $word;
607	}
608	close STOPFILE;
609
610	}
611	}
612
613	# Read thesaurus information
614	if ($thesaurus) {
615
616	# Ensure both link and term files exist
617	$thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
618	die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
619	$thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.EN");
620	die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
621
622	# Read the thesaurus terms
623	open(TH, "<$thesaurus_terms");
624	while(<TH>) {
625	s/^\d+ //;
626	s/\(.*\)//;
627	foreach $w (split(/\s+/, $_)) {
628	$thesaurus{lc($w)} = $w;
629	}
630	}
631	close TH;
632	}
633
634	# Read words in the text and count occurences
635	open(TXT, "<$phinddir/clauses");
636	my @words;
637
638	while(<TXT>) {
639	$line = $_;
640	next unless ($line =~ /./);
641
642	@words = split(/\s+/, $line);
643	foreach $w (@words) {
644	$l = lc($w);
645	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
646	$freq{$w}++;
647	}
648	$freq{$senlimit}++;
649	}
650
651	# Calculate the "best" form of each word
652	my (%bestform, %totalfreq, %bestfreq);
653
654	foreach $w (sort (keys %freq)) {
655	$l = lc($w);
656
657	# totalfreq is the number of times a term appears in any form
658	$totalfreq{$l} += $freq{$w};
659
660	if (defined $stopwords{$l}) {
661	$bestform{$l} = $stopwords{$l};
662
663	} elsif (defined $thesaurus{$l}) {
664	$bestform{$l} = $thesaurus{$l};
665
666	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
667	$bestfreq{$l} = $freq{$w};
668	$bestform{$l} = $w;
669	}
670	}
671
672	undef %freq;
673	undef %bestfreq;
674
675
676	# Assign symbol numbers to tokens
677	my $nextsymbol = 1;
678	my (@vocab);
679
680	# Delimiters
681	$first_delimiter = 1;
682
683	foreach $word (@delimiters) {
684
685	$word = lc($word);
686	$bestform{$word} = uc($word);
687	$vocab[$nextsymbol] = $word;
688	$symbol{$word} = $nextsymbol;
689	$nextsymbol++;
690	}
691	$last_delimiter = $nextsymbol - 1;
692
693	# Stopwords
694	$first_stopword = $nextsymbol;
695
696	foreach my $word (sort keys %stopwords) {
697
698	# don't incluse stopword unless it occurs in the text
699	$word = lc($word);
700	next unless ($totalfreq{$word});
701	next if ($symbol{$word});
702
703	$vocab[$nextsymbol] = $word;
704	$symbol{$word} = $nextsymbol;
705	$nextsymbol++;
706	}
707	$last_stopword = $nextsymbol - 1;
708	$first_contentword = $nextsymbol;
709
710	# Thesaurus terms
711	if ($thesaurus) {
712	$first_thesaurusword = $nextsymbol;
713
714	foreach my $word (sort keys %thesaurus) {
715
716	$word = lc($word);
717	next if ($symbol{$word});
718	$bestform{$word} = $thesaurus{$word};
719
720	$vocab[$nextsymbol] = $word;
721	$symbol{$word} = $nextsymbol;
722	$nextsymbol++;
723
724	}
725	$last_thesaurusword = $nextsymbol - 1;
726	}
727
728	# Other content words
729	$first_extractword = $nextsymbol;
730
731	foreach my $word (sort (keys %bestform)) {
732
733	next if ($symbol{$word});
734
735	$vocab[$nextsymbol] = $word;
736	$symbol{$word} = $nextsymbol;
737	$nextsymbol++;
738	}
739	$last_extractword = $nextsymbol - 1;
740	$last_contentword = $nextsymbol - 1;
741
742
743	# Outut the words
744	print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
745	open(VOC, ">$phinddir/clauses.vocab");
746
747	for (my $i = 1; $i < $nextsymbol; $i++) {
748	$w = $vocab[$i];
749
750	print VOC "$bestform{$w}\n";
751	$totalfreq{$w} = 0 unless ($totalfreq{$w});
752	}
753	close VOC;
754
755
756	# Create statistics file
757	# Output statistics about the vocablary
758	print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
759	&util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
760
761	open(STAT, ">$phinddir/clauses.stats")
762	\|\| die "Cannot open $phinddir/clauses.stats: $!";
763
764	print STAT "first_delimiter $first_delimiter\n";
765	print STAT "last_delimiter $last_delimiter\n";
766	print STAT "first_stopword $first_stopword\n";
767	print STAT "last_stopword $last_stopword\n";
768	if ($thesaurus) {
769	print STAT "first_thesaurusword $first_thesaurusword\n";
770	print STAT "last_thesaurusword $last_thesaurusword\n";
771	}
772	print STAT "first_extractword $first_extractword\n";
773	print STAT "last_extractword $last_extractword\n";
774	print STAT "first_contentword $first_contentword\n";
775	print STAT "last_contentword $last_contentword\n";
776	print STAT "first_symbol $first_delimiter\n";
777	print STAT "last_symbol $last_contentword\n";
778	print STAT "first_word $first_stopword\n";
779	print STAT "last_word $last_contentword\n";
780	close STAT;
781
782	undef @vocab;
783
784
785	# Create numbers file
786	# Save text as symbol numbers
787	print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
788
789	open(TXT, "<$phinddir/clauses");
790	open(NUM, ">$phinddir/clauses.numbers");
791
792	$phrasedelimiter = $symbol{lc($senlimit)};
793	print NUM "$symbol{lc($colstart)}\n";
794
795	# set up the special symbols that delimit documents and sentences
796	while(<TXT>) {
797
798	# split sentence into a list of tokens
799	$line = $_;
800	next unless ($line =~ /./);
801	@words = split(/\s+/, $line);
802
803	# output one token at a time
804	foreach $word (@words) {
805	$word = lc($word);
806	print NUM "$symbol{$word}\n";
807	}
808
809	# output phrase delimiter
810	print NUM "$phrasedelimiter\n";
811	}
812
813	print NUM "$symbol{lc($colend)}\n";
814	close NUM;
815
816	# Save thesaurus data in one convienient file
817	if ($thesaurus) {
818
819	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
820
821
822	print $out "Saving thesaurus as numbers in $thesaurusfile\n"
823	if ($verbosity > 1);
824
825	# Read the thesaurus terms
826	my ($num, $text, %thes_symbols);
827
828	open(TH, "<$thesaurus_terms");
829	while(<TH>) {
830	chomp;
831	@words = split(/\s+/, $_);
832	$num = shift @words;
833	$text = "";
834
835	# translate words into symbol numbers
836	foreach $word (@words) {
837	$word = lc($word);
838	if ($symbol{$word}) {
839	$text .= "s$symbol{$word} ";
840	} elsif ($verbosity) {
841	print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
842	}
843	}
844	$text =~ s/ $//;
845	$thes_symbols{$num} = $text;
846	}
847	close TH;
848
849	# Read the thesaurus links and write the corresponding data
850	open(TH, "<$thesaurus_links");
851	open(THOUT, ">$thesaurusfile");
852
853	while(<TH>) {
854	chomp;
855	($num, $text) = split(/:/, $_);
856
857	if (defined($thes_symbols{$num})) {
858	print THOUT "$num:$thes_symbols{$num}:$text\n";
859	} else {
860	print THOUT "$num:untranslated:$text\n";
861	}
862	}
863	close TH;
864	close THOUT;
865	}
866
867
868
869
870	}
871
872
873	# renumber_phrases
874	#
875	# Prepare the phrases file to be input to mgpp. The biggest problem is
876	# reconciling the phrase identifiers used by the suffix program (which
877	# we'll call suffix-id numbers) with the numbers used in the thesaurus
878	# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
879	# Phind-id numbers must be sorted by frequency of occurance.
880	#
881	# Start creating a set of phind-id numbers from the sorted suffix-id
882	# numbers and (if required) the thesaurus-id numbers. Then add any other
883	# phrases occuring in the thesaurus.
884	#
885	# The last thing we have to do is restore the vocabulary information to the
886	# phrase file so that the phrases are stored as words, not as symbol
887	# numbers.
888
889	# The original phrases file looks something like this:
890	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
891	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
892	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
893	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
894
895
896	sub renumber_phrases {
897	my ($self) = @_;
898
899	renumber_suffix_data($self);
900	renumber_thesaurus_data($self);
901	restore_vocabulary_data($self);
902
903	}
904
905
906
907	# renumber_suffix_data
908	#
909	# Translate phrases file to phrases.2 using phind keys instead
910	# of suffix keys and sorting the expansion data.
911
912	sub renumber_suffix_data {
913	my ($self) = @_;
914
915	my $verbosity = $self->{'verbosity'};
916	my $out = $self->{'outhandle'};
917	print $out "Translate phrases: suffix-ids become phind-id's\n"
918	if ($verbosity);
919
920	my $phinddir = $self->{'phinddir'};
921	my $infile = &util::filename_cat($phinddir, 'phrases');
922	my $outfile = &util::filename_cat($phinddir, 'phrases.2');
923
924	# Read the phrase file. Calculate initial set of phind-id
925	# numbers and store (suffixid -> frequency) relation.
926
927	my %suffixtophind;
928	my @phindfrequency;
929	my (@fields, $suffixid);
930	my $nextphind = 1;
931
932	open(IN, "<$infile");
933	while(<IN>) {
934
935	chomp;
936	@fields = split(/:/, $_);
937
938	# get next suffixid and phindid
939	$suffixid = shift @fields;
940	$suffixtophind{$suffixid} = $nextphind;
941
942	# store total frequency
943	shift @fields;
944	$totalfrequency[$nextphind] = shift @fields;
945
946	$nextphind++;
947	}
948	close IN;
949
950
951	# Translate phrases file to phrases.2. Use phind keys (not suffix
952	# keys), sort expansion and document occurance data in order of
953	# descending frequency..
954	open(IN, "<$infile");
955	open(OUT, ">$outfile");
956
957	my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
958	my (@documwents, @newexp, $k, $n);
959	my $linenumber = 0;
960
961	while(<IN>) {
962
963	# read the line
964	chomp;
965	@fields = split(/:/, $_);
966
967	# get a phrase number for this line
968	$suffixid = shift @fields;
969	die unless (defined($suffixtophind{$suffixid}));
970	$phindid = $suffixtophind{$suffixid};
971
972	# get the symbols in the phrase
973	$text = shift @fields;
974
975	# output status information
976	$linenumber++;
977	if ($verbosity > 2) {
978	if ($linenumber % 1000 == 0) {
979	print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
980	}
981	print $out "$num: $key\t($text)\n" if ($verbosity > 3);
982	}
983
984	# get the phrase frequency
985	$tf = shift @fields;
986
987	# get the number of expansions
988	$countexp = shift @fields;
989
990	# get the expansions, convert them into phind-id numbers, and sort them
991	$expansions = shift @fields;
992	@newexp = ();
993	foreach $k (split(/,/, $expansions)) {
994	die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
995	$n = $suffixtophind{$k};
996	push @newexp, $n;
997	}
998	@newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
999
1000	# get the number of documents
1001	$countdocs = shift @fields;
1002
1003	# get the documents and sort them
1004	$documents = shift @fields;
1005	$documents =~ s/d//g;
1006	@documents = split(/;/, $documents);
1007	@documents = sort by_doc_frequency @documents;
1008
1009	# output the phrase data
1010	print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1011	print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1012
1013	}
1014
1015	close IN;
1016	close OUT;
1017	}
1018
1019
1020	# renumber_thesaurus_data
1021	#
1022	# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1023
1024	sub renumber_thesaurus_data {
1025	my ($self) = @_;
1026
1027	my $out = $self->{'outhandle'};
1028	my $verbosity = $self->{'verbosity'};
1029	my $thesaurus = $self->{'thesaurus'};
1030
1031	my $phinddir = $self->{'phinddir'};
1032	my $infile = &util::filename_cat($phinddir, "phrases.2");
1033	my $outfile = &util::filename_cat($phinddir, "phrases.3");
1034
1035
1036	# If no thesaurus is defined, simply move the phrases file.
1037	if (!$thesaurus) {
1038	print $out "Translate phrases.2: no thesaurus data\n"
1039	if ($verbosity);
1040	&util::mv($infile, $outfile);
1041	return;
1042	}
1043
1044	print $out "Translate phrases.2: add thesaurus data\n"
1045	if ($verbosity);
1046
1047	# 1.
1048	# Read thesaurus file and store (symbols->thesaurusid) mapping
1049	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1050	my %symbolstothesid;
1051	my (@fields, $thesid, $symbols);
1052
1053	open(TH, "<$thesaurusfile");
1054
1055	while (<TH>) {
1056
1057	chomp;
1058	@fields = split(/:/, $_);
1059
1060	# get id and text
1061	$thesid = shift @fields;
1062	$symbols = shift @fields;
1063	$symbolstothesid{$symbols} = $thesid;
1064	}
1065	close TH;
1066
1067	# 2.
1068	# Read phrases file to find thesaurus entries that already
1069	# have a phindid. Store their phind-ids for later translation.
1070	my %thesaurustophindid;
1071	my ($phindid);
1072
1073	open(IN, "<$infile");
1074
1075	while(<IN>) {
1076
1077	chomp;
1078	@fields = split(/:/, $_);
1079
1080	# phindid and symbols for this line
1081	$phindid = shift @fields;
1082	$symbols = shift @fields;
1083
1084	# do we have a thesaurus id corresponding to this phrase?
1085	if (defined($symbolstothesid{$symbols})) {
1086	$thesid = $symbolstothesid{$symbols};
1087	$thesaurustophindid{$thesid} = $phindid;
1088	}
1089	}
1090	close IN;
1091
1092	undef %symbolstothesid;
1093
1094	# 3.
1095	# Create phind-id numbers for remaining thesaurus entries
1096	my $nextphindid = $phindid + 1;
1097
1098	open(TH, "<$thesaurusfile");
1099	while(<TH>) {
1100
1101	chomp;
1102	@fields = split(/:/, $_);
1103
1104	# read thesaurus-id and ensure it has a corresponding phind-id
1105	$thesid = shift @fields;
1106	if (!defined($thesaurustophindid{$thesid})) {
1107	$thesaurustophindid{$thesid} = $nextphindid;
1108	$nextphindid++;
1109	}
1110	}
1111	close TH;
1112
1113	# 4.
1114	# Translate thesaurus file, replacing thesaurus-id numbers with
1115	# phind-id numbers.
1116	my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1117	my ($relations, $linkcounter, $linktext, $linktype, @linkdata, $link);
1118
1119	open(TH, "<$thesaurusfile");
1120	open(TO, ">$newthesaurusfile");
1121	while(<TH>) {
1122
1123	chomp;
1124	@fields = split(/:/, $_);
1125
1126	# phindid and symbols for this line
1127	($thesid, $symbols, $relations) = @fields;
1128
1129	die unless ($thesid && $symbols);
1130	die unless $thesaurustophindid{$thesid};
1131	$phindid = $thesaurustophindid{$thesid};
1132
1133	# convert each part of the relation string to use phind-id numbers
1134	$newrelation = "";
1135	$linkcounter = 0;
1136	foreach $linktext (split(/;/, $relations)) {
1137	@linkdata = split(/,/, $linktext);
1138
1139	# remember the linktype (e.g. BT, NT)
1140	$linktype = shift @linkdata;
1141	$newrelation .= "$linktype,";
1142
1143	# convert the link target identfiers
1144	foreach $link (@linkdata) {
1145	die unless (defined($thesaurustophindid{$link}));
1146	$newrelation .= "$thesaurustophindid{$link},";
1147	$linkcounter++;
1148	}
1149	$newrelation =~ s/\,$//;
1150	$newrelation .= ";";
1151	}
1152	$newrelation .= ":";
1153
1154	print TO "$phindid:$symbols:$linkcounter:$newrelation\n";
1155	}
1156	close TH;
1157	close TO;
1158
1159	undef %thesaurustophindid;
1160
1161	# 5.
1162	# Read thesaurus data (in phind-id format) into memory
1163	my %thesaurusdata;
1164
1165	open(TH, "<$newthesaurusfile");
1166	while(<TH>) {
1167	chomp;
1168	($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1169	die unless ($phindid && $symbols);
1170	$thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1171	}
1172
1173	# 6.
1174	# Add thesaurus data to phrases file
1175	my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1176	my (@documwents, @newexp, $k, $n);
1177	my $linenumber = 0;
1178
1179	open(IN, "<$infile");
1180	open(OUT, ">$outfile");
1181
1182	# Update existing phrases
1183	while(<IN>) {
1184
1185	chomp;
1186	@fields = split(/:/, $_);
1187
1188	# get data for this line
1189	$phindid = shift @fields;
1190
1191	# output the phrase data, with thesaurus information
1192	print OUT "$phindid:", join(":", @fields);
1193
1194	# add thesaurus data
1195	if (defined($thesaurusdata{$phindid})) {
1196	@fields = split(/:/, $thesaurusdata{$phindid});
1197	shift @fields;
1198	$linkcounter = shift @fields;
1199	$relations = shift @fields;
1200
1201	print OUT ":$linkcounter:$relations";
1202	$thesaurusdata{$phindid} = "";
1203	}
1204	print OUT "\n";
1205	}
1206	close IN;
1207
1208	# Add phrases that aren't already in the file
1209	foreach $phindid (sort numerically keys %thesaurusdata) {
1210	next unless ($thesaurusdata{$phindid});
1211
1212	@fields = split(/:/, $thesaurusdata{$phindid});
1213	$symbols = shift @fields;
1214	$linkcounter = shift @fields;
1215	$relations = shift @fields;
1216
1217	print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1218	}
1219	close OUT;
1220
1221	}
1222
1223	# restore_vocabulary_data
1224	#
1225	# Read phrases.3 and restore vocabulary information. Then write
1226	# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1227	# (if requested) to the saved phrases file.
1228
1229	sub restore_vocabulary_data {
1230	my ($self) = @_;
1231
1232	my $out = $self->{'outhandle'};
1233	my $verbosity = $self->{'verbosity'};
1234	print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1235
1236	my $phinddir = $self->{'phinddir'};
1237	my $infile = &util::filename_cat($phinddir, 'phrases.3');
1238	my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1239	my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1240	my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1241
1242	my $savephrases = $self->{'savephrases'};
1243
1244	# 1.
1245	# Read the vocabulary file
1246	open(V, "<$vocabfile")
1247	\|\| die "Cannot open $vocabfile: $!";
1248	my @symbol;
1249	my $i = 1;
1250	while(<V>) {
1251	chomp;
1252	$symbol[$i++] = $_;
1253	}
1254
1255
1256	# 2.
1257	# Translate phrases.3 to MGPP input files
1258	my ($key, $text, $word);
1259	my @fields;
1260	my $linenumber = 0;
1261
1262	open(IN, "<$infile");
1263	open(DATA, ">$datafile");
1264	open(WORD, ">$wordfile");
1265
1266	# Save the phrases in a separate text file
1267	if ($savephrases) {
1268	print $out "Saving phrases in $savephrases\n" if ($verbosity);
1269	open(SAVE, ">$savephrases");
1270	}
1271
1272	while(<IN>) {
1273
1274	# read the line
1275	chomp;
1276	$line = $_;
1277	@fields = split(/:/, $line);
1278
1279	# get a phrase number for this line
1280	$key = shift @fields;
1281
1282	# restore the text of the phrase
1283	$text = shift @fields;
1284	$text =~ s/s(\d+)/$symbol[$1]/g;
1285	if ($text =~ / /) {
1286	$word = "";
1287	} elsif ($text ne 'untranslated') {
1288	$word = $text;
1289	}
1290
1291	# output the phrase data
1292	print DATA "<Document>";
1293	print DATA "$key:$text:", join(":", @fields), ":\n";
1294
1295	# output the word index search data
1296	print WORD "<Document>$word\n";
1297
1298	# output the phrases to a text file
1299	if ($savephrases) {
1300	print SAVE $fields[0], "\t", $fields[2], "\t", "$text\n";
1301	}
1302	}
1303	close IN;
1304	close WORD;
1305	close DATA;
1306	close SAVE if ($savephrases);
1307
1308	}
1309
1310
1311
1312	# sort routines used to renumber phrases
1313
1314	sub numerically { $a <=> $b }
1315
1316	sub by_doc_frequency {
1317	my $fa = 1;
1318	if ($a =~ /,/) {
1319	$fa = $a;
1320	$fa =~ s/\d+,//;
1321	}
1322	my $fb = 1;
1323	if ($b =~ /,/) {
1324	$fb = $b;
1325	$fb =~ s/\d+,//;
1326	}
1327
1328	return ($fb <=> $fa);
1329	}
1330
1331	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: