Context Navigation

source: trunk/gsdl/perllib/classify/phind.pm@ 1646

Last change on this file since 1646 was 1646, checked in by paynter, 24 years ago
Arguments for setting suffix program parameters.
Property svn:keywords set to `Author Date Id Revision`
File size: 26.0 KB

Line
1	###########################################################################
2	#
3	# phind.pm -- the Phind classifier
4	#
5	# Copyright (C) 2000 Gordon W. Paynter
6	# Copyright (C) 2000 New Zealand Digital Library Project
7	#
8	#
9	# A component of the Greenstone digital library software
10	# from the New Zealand Digital Library Project at the
11	# University of Waikato, New Zealand.
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# The phind clasifier plugin
30	#
31	# options are:
32	# title=Title The title field for this classification
33	# text=fields The text used to build the phrase hierarchy
34	# phindexdir=directory Location of phind index files
35	# verbosity=num Control amount of output
36	# untidy=true Do not clean up intermediate files
37	# suffixmode=num Mode of suffix program (0 = all phrases, 1 = stopword)
38	# suffixsize=num Number of symbols available to suffix program
39
40
41	# How a classifier works.
42	#
43	# When a classifier is requested in the collect.cfg file, buildcol creates a
44	# new classifier object (such as the one defined in theis file) and later
45	# passes each document object to the classifier in turn. Four functions are
46	# used:
47	#
48	# 1. "new" is called before the documents are processed to set up the
49	# classifier.
50	#
51	# 2. "init" is called after buildcol.pl has created the indexes etc but
52	# before the documents are classified in order that the classifier might
53	# set any varioables it requiers, etc.
54	#
55	# 3. "classify" is called once for each document object. The classifier
56	# "classifies" each document and updates its local data accordingly.
57	#
58	# 4. "get_classify_info" is called after every document has been
59	# classified. It collates the information about the documents and
60	# stores a reference to the classifier so that Greenstone can later
61	# display it.
62
63
64	package phind;
65
66	use BasClas;
67	use util;
68
69	sub BEGIN {
70	@ISA = ('BasClas');
71	}
72
73	# Define delimiter symbols - this should be abstracted out someplace
74	my $colstart = "COLLECTIONSTART";
75	my $colend = "COLLECTIONEND";
76	my $doclimit = "DOCUMENTLIMIT";
77	my $senlimit = "SENTENCELIMIT";
78	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
79
80	# Create a new phind browser based on the options in collect.cfg
81
82	sub new {
83	my ($class, @options) = @_;
84	my $self = new BasClas ($class, @_);
85	my $out = pop @options;
86
87	# Phind installation check
88	# The phind phrase browser is research software and is not installed
89	# by defualt. If the user attepts to use it we warn them that it's a
90	# bit dodgy, then tell them how to install it. If they can do that
91	# and get all the files in place, then we let them proceed.
92
93	print $out "The Phind classifier for Greenstone.\n";
94	print $out "Checking the phind phrase browser requirements...\n";
95
96	# Make sure we're not in windows
97	if ($ENV{'GSDLOS'} =~ /windows/i) {
98	print STDERR "Phind currently only works under Unix";
99	exit(1);
100	}
101
102	# Ensure the Phind generate scripts are in place
103	my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
104	my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
105
106	if (!(-e $file1)) {
107	print STDERR "The phind \"suffix\" program is not installed. ";
108	print STDERR "To install it, change to the directory\n";
109	print STDERR " $src\n";
110	print STDERR "and type \"make install-phind\".\n\n";
111	exit(1);
112	}
113
114	# Ensure the Phind CGI script is in place
115	$file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
116	$src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");
117
118	if (!(-e $file1)) {
119	print STDERR "The phind CGI program is not installed. ";
120	print STDERR "To install it, change to the directory\n";
121	print STDERR " $src\n";
122	print STDERR "and type \"make install-phind\".\n\n";
123	exit(1);
124	}
125
126	# Ensure the Phind Java applet is in place
127	$src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
128	$file1 = &util::filename_cat($src, "Phind.class");
129
130	if (!(-e $file1)) {
131	print STDERR "The phind Java classes are not compiled. ";
132	print STDERR "To compile them, change to the directory\n";
133	print STDERR " $src\n";
134	print STDERR "and use your Java compiler to compile Phind.java.\n";
135	print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
136	exit(1);
137	}
138
139
140	# The installation appears OK - set up the classifier
141	my $collection = $ENV{'GSDLCOLLECTION'};
142	my $phindexdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"phindex");
143	my $language = "english";
144
145	my $title = "Topic";
146	my $indexes = "section:Title,section:text";
147
148	my $suffixmode = 1;
149	my $suffixsize = 40000000;
150
151	my $verbosity = 2;
152	my $untidy = 0;
153
154	# parse the options
155	foreach $option (@options) {
156
157	if ($option =~ /^text=(.*)$/i) {
158	$indexes = $1;
159	} elsif ($option =~ /^title=(.*)$/i) {
160	$title = $1;
161	} elsif ($option =~ /^phindexdir=(.*)$/i) {
162	$phindexdir = $1;
163	} elsif ($option =~ /^suffixsize=(.*)$/i) {
164	$suffixsize = $1;
165	} elsif ($option =~ /^suffixmode=(.*)$/i) {
166	$suffixmode = $1;
167	} elsif ($option =~ /^verbosity=(.*)$/i) {
168	$verbosity = $1;
169	} elsif ($option =~ /^untidy/i) {
170	$untidy = 1;
171	}
172	}
173
174
175	$self->{'collection'} = $collection;
176	$self->{'title'} = $title;
177	$self->{'indexes'} = $indexes;
178
179	$self->{'suffixmode'} = $suffixmode;
180	$self->{'suffixsize'} = $suffixsize;
181
182	$self->{'verbosity'} = $verbosity;
183	$self->{'untidy'} = $untidy;
184
185	# limit languages
186	$language =~ s/,/\\|/g;
187	$self->{'language_exp'} = $language;
188	$self->{'delimiter'} = $delimiter;
189
190	# reset phindex directory
191	if (-e "$phindexdir") {
192	&util::rm_r("$phindexdir");
193	}
194	&util::mk_dir("$phindexdir");
195	$self->{'phindexdir'} = $phindexdir;
196
197	return bless $self, $class;
198	}
199
200
201	# Initialise the phind classifier
202
203	sub init {
204	my $self = shift (@_);
205
206	# open filehandles for documents and text
207	my $phindexdir = $self->{'phindexdir'};
208
209	my $clausefile = &util::filename_cat("$phindexdir", "clauses");
210	&util::rm($clausefile) if (-e $clausefile);
211	open(TEXT, ">$clausefile") \|\| die "Cannot open $clausefile: $!";
212	$self->{'txthandle'} = TEXT;
213
214	my $docfile = &util::filename_cat("$phindexdir", "docs.txt");
215	&util::rm($docfile) if (-e $docfile);
216	open(DOCS, ">$docfile") \|\| die "Cannot open $docfile: $!";
217	$self->{'dochandle'} = DOCS;
218
219	}
220
221	# Classify each document.
222	#
223	# Each document is passed here in turn. The classifier extracts the
224	# text of each and stores it in the clauses file. Document details are
225	# stored in the docs.txt file.
226
227	sub classify {
228	my $self = shift (@_);
229	my ($doc_obj) = @_;
230
231	my $verbosity = $self->{'verbosity'};
232	my $top_section = $doc_obj->get_top_section();
233
234	my $title = $doc_obj->get_metadata_element ($top_section, "Title");
235	print "process: $title\n" if ($verbosity > 2);
236
237
238	# only consider english-language files
239	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
240	my $phrlanguage = $self->{'language_exp'};
241	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
242
243	# record this file
244	my $total++;
245	print "file $total: $file\n" if ($self->{'$verbosity'});
246
247
248	# Store document details
249	my $OID = $doc_obj->get_OID();
250	$OID = "NULL" unless defined $OID;
251	my $dochandle = $self->{'dochandle'};
252	print $dochandle "<Document>\t$OID\t$title\n";
253
254	# Store the text occuring in this object
255
256	# output the document delimiter
257	my $txthandle = $self->{'txthandle'};
258	print $txthandle "$doclimit\n";
259
260	# iterarate over the required indexes and store their text
261	my $indexes = $self->{'indexes'};
262	my $text = "";
263	my ($part, $level, $field, $section, $data);
264
265	foreach $part (split(/,/, $indexes)) {
266
267	# Each field has a level and a data element ((e.g. document:Title)
268	($level, $field) = split(/:/, $part);
269	die unless ($level && $field);
270
271	# Extract the text from every section
272	# (In phind, document:text and section:text are equivalent)
273	if ($field eq "text") {
274	$data = "";
275	$section = $doc_obj->get_top_section();
276	while (defined($section)) {
277	$data .= $doc_obj->get_text($section) . "\n";
278	$section = $doc_obj->get_next_section($section);
279	}
280	$text .= convert_gml_to_tokens($data) . "\n";
281	}
282
283	# Extract a metadata field from a document
284	elsif ($level eq "document") {
285	$data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
286	$text .= convert_gml_to_tokens($data) . "\n";
287	}
288
289	# Extract metadata from every section in a document
290	elsif ($level eq "section") {
291	$data = "";
292	$section = $doc_obj->get_top_section();
293	while (defined($section)) {
294	$data .= $doc_obj->get_metadata_element($section, $field) . "\n";
295	$section = $doc_obj->get_next_section($section);
296	}
297	$text .= convert_gml_to_tokens($data) . "\n";
298	}
299
300	# Some sort of specification which I don't understand
301	else {
302	die "Unknown level ($level) in phind index ($part)\n";
303	}
304
305	}
306
307	# output the text
308	$text =~ tr/\n//s;
309	print $txthandle "$text";
310
311	}
312
313
314
315	# Construct the classifier from the information already gathered
316	#
317	# When get_classify_info is called, the clauses and docs.txt files have
318	# already been constructed in the phindex directory. This function will
319	# translate them into compressed, indexed MGPP files that can be read by
320	# the phindcgi script. It will also register our classifier so that it
321	# shows up in thenavigation bar.
322
323	sub get_classify_info {
324	my $self = shift (@_);
325
326	my $verbosity = $self->{'verbosity'};
327	my $phindexdir = $self->{'phindexdir'};
328	my $language = "english";
329
330	if ($verbosity) {
331	print STDERR "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
332	}
333
334	# Construct phind indexes
335	my $suffixmode = $self->{'suffixmode'};
336	my $suffixsize = $self->{'suffixsize'};
337	my ($command, $status);
338
339	# Generate the vocabulary, symbol statistics, and numbers file
340	# from the clauses file
341	print "\nExtracting vocabulary and statistics\n" if $verbosity;
342	&extract_vocabulary($phindexdir, $language, $verbosity);
343
344	# Use the suffix program to generate the phindex/phrases file
345	print "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
346	&execute("suffix $phindexdir $suffixsize $suffixmode", $verbosity);
347
348	# Create the phrase file and put phrase numbers in phindex/phrases
349	print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
350	&renumber_phrases("$phindexdir", $verbosity);
351
352	# Create the mg phrase database
353	my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
354	my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
355	my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
356
357	my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
358	my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
359	my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
360	my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
361
362	print "\nCreating phrase databases\n";
363	my $mg_input = &util::filename_cat($phindexdir, "pdata.txt");
364	my $mg_stem = "pdata";
365
366	&execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
367	&execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
368	&execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);
369
370	# create the mg index of words
371	print "\nCreating word-level search indexes\n";
372	$mg_input = &util::filename_cat($phindexdir, "pword.txt");
373	$mg_stem = "pword";
374
375	&execute("$mg_passes -d $phindexdir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
376	&execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
377	&execute("$mg_perf_hash_build -d $phindexdir -f $mg_stem", $verbosity);
378	&execute("$mg_passes -d $phindexdir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
379	&execute("$mg_weights_build -d $phindexdir -f $mg_stem", $verbosity);
380	&execute("$mg_invf_dict -d $phindexdir -f $mg_stem", $verbosity);
381
382	&execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 1", $verbosity);
383	&execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 2", $verbosity);
384	&execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 3", $verbosity);
385
386	# create the mg document information database
387	print "\nCreating document information databases\n";
388	$mg_input = &util::filename_cat($phindexdir, "docs.txt");
389	$mg_stem = "docs";
390
391	&execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
392	&execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
393	&execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);
394
395
396	# Tidy up stray files
397	if (!$untidy) {
398	print "\nCleaning up\n" if ($verbosity > 2);
399	&util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
400	"$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
401	"$phindexdir/phrases", "$phindexdir/docs.txt",
402	"$phindexdir/pdata.txt", "$phindexdir/pword.txt");
403	my $outfile = 1;
404	while (-e "$phindexdir/outPhrase.$outfile") {
405	&util::rm("$phindexdir/outPhrase.$outfile");
406	$outfile++;
407	}
408	}
409
410
411	# Insert the classifier into.... what?
412	my $collection = $self->{'collection'};
413	my $url = "library?a=p&p=phind&c=$collection";
414
415	my %classifyinfo = ('thistype'=>'Invisible',
416	'childtype'=>'Phind',
417	'Title'=>$self->{'title'},
418	'contains'=>[]);
419
420	push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
421	return \%classifyinfo;
422	}
423
424
425
426	sub convert_gml_to_tokens {
427
428	$_ = shift @_;
429
430	# FIRST, remove GML tags
431
432	# Replace all whitespace with a simple space
433	s/\s+/ /gs;
434
435	# Remove everything that is in a tag
436	s/\s<p>\s/ PARAGRAPHBREAK /isg;
437	s/\s<br>\s/ LINEBREAK /isg;
438	s/<[^>]*>/ /sg;
439
440	# Now we have the text, but it may contain HTML
441	# elements coded as > etc. Remove these tags.
442	s/</</sg;
443	s/>/>/sg;
444
445	s/\s+/ /sg;
446	s/\s<p>\s/ PARAGRAPHBREAK /isg;
447	s/\s<br>\s/ LINEBREAK /isg;
448	s/<[^>]*>/ /sg;
449
450	# remove & and other miscellaneous markup tags
451	s/&/&/sg;
452	s/</</sg;
453	s/>/>/sg;
454	s/&/&/sg;
455
456	# replace<p> and <br> placeholders with carriage returns
457	s/PARAGRAPHBREAK/\n/sg;
458	s/LINEBREAK/\n/sg;
459
460
461	# Exceptional punctuation
462	#
463	# We make special cases of some punctuation
464
465	# remove any apostrophe that indicates omitted letters
466	s/(\w+)\'(\w*\s)/ $1$2 /g;
467
468	# remove period that appears in a person's initals
469	s/\s([A-Z])\./ $1 /g;
470
471	# replace hyphens in hypheanted words and names with a space
472	s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
473
474
475	# Convert the remaining text to "clause format",
476	# This means removing all excess punctuation and garbage text,
477	# normalising valid punctuation to fullstops and commas,
478	# then putting one cluse on each line.
479
480	# Insert newline when the end of a sentence is detected
481	# (delimter is: "[\.\?\!]\s")
482	s/\s*[\.\?\!]\s+/\n/g;
483
484	# split numbers after four digits
485	s/(\d\d\d\d)/$1 /g;
486
487	# split words after 32 characters
488
489	# squash repeated punctuation
490	tr/A-Za-z0-9 //cs;
491
492	# save email addresses
493	# s/\w+@\w+\.[\w\.]+/EMAIL/g;
494
495	# normalise clause breaks (mostly punctuation symbols) to commas
496	s/[^A-Za-z0-9 \n]+/ , /g;
497
498	# Remove repeated commas, and replace with newline
499	s/\s*,[, ]+/\n/g;
500
501	# remove extra whitespace
502	s/ +/ /sg;
503	s/^\s+//mg;
504	s/\s*$/\n/mg;
505
506	# remove lines that contain one word or less
507	s/^\w*$//mg;
508	s/^\s*$//mg;
509	tr/\n//s;
510
511	return $_;
512	}
513
514
515	# Execute a system command
516
517	sub execute {
518	my ($command, $verbosity) = @_;
519	print "Executing: $command\n" if ($verbosity > 2);
520	my $status = system($command);
521	if ($status != 0) {
522	print STDERR "phindgen.pl - Error executing $command: $!\n";
523	exit($status);
524	}
525	}
526
527
528	# Generate the vocabulary, symbol statistics, and numbers file from the
529	# clauses file. This is legacy code, so is a bit messy and probably wont
530	# run under windows.
531
532	sub extract_vocabulary {
533	my ($phindex_dir, $language, $verbosity) = @_;
534
535	my ($w, $l, $line, $word);
536
537	my ($first_delimiter, $last_delimiter,
538	$first_stopword, $last_stopword,
539	$first_extractword, $last_extractword,
540	$first_contentword, $last_contentword,
541	$phrasedelimiter);
542
543	my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);
544
545
546	my %symbol;
547	my (%freq);
548
549	print "Calculating vocabulary\n" if ($verbosity > 1);
550
551	# Read and store the stopwords
552	my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" \| xargs cat`;
553	my %stopwords;
554	foreach my $w (split(/\s+/, $words)) {
555	$l = lc($w);
556	$stopwords{$l} = $w;
557	}
558
559	# Read and store the thesaurus terms
560	$use_thesaurus = 0;
561	my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
562	"$language", "agrovoc.lex");
563	if (-e "$lex_file") {
564	open(TH, "<$lex_file");
565	while(<TH>) {
566	s/^\d+ //;
567	s/\(.*\)//;
568	foreach my $w (split(/\s+/, $_)) {
569	$thesaurus{lc($w)} = $w;
570	}
571	}
572	close TH;
573	$use_thesaurus = 1;
574	}
575
576	# Read words in the text and count occurences
577	open(TXT, "<$phindex_dir/clauses");
578	my @words;
579
580	while(<TXT>) {
581	$line = $_;
582	next unless ($line =~ /./);
583
584	@words = split(/\s+/, $line);
585	foreach $w (@words) {
586	$l = lc($w);
587	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
588	$freq{$w}++;
589	}
590	$freq{$senlimit}++;
591	}
592
593	# Calculate the "best" form of each word
594	my (%bestform, %totalfreq, %bestfreq);
595
596	foreach $w (sort (keys %freq)) {
597	$l = lc($w);
598
599	# totalfreq is the number of times a term appears in any form
600	$totalfreq{$l} += $freq{$w};
601
602	if (defined $stopwords{$l}) {
603	$bestform{$l} = $stopwords{$l};
604
605	} elsif (defined $thesaurus{$l}) {
606	$bestform{$l} = $thesaurus{$l};
607
608	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
609	$bestfreq{$l} = $freq{$w};
610	$bestform{$l} = $w;
611	}
612	}
613
614	undef %freq;
615	undef %bestfreq;
616
617
618	# Assign symbol numbers to tokens
619	my $nextsymbol = 1;
620	my (@vocab);
621
622	# Delimiters
623	$first_delimiter = 1;
624
625	foreach $word (@delimiters) {
626
627	$word = lc($word);
628	$bestform{$word} = uc($word);
629	$vocab[$nextsymbol] = $word;
630	$symbol{$word} = $nextsymbol;
631	$nextsymbol++;
632	}
633	$last_delimiter = $nextsymbol - 1;
634
635	# Stopwords
636	$first_stopword = $nextsymbol;
637
638	foreach my $word (sort keys %stopwords) {
639
640	# don't incluse stopword unless it occurs in the text
641	$word = lc($word);
642	next unless ($totalfreq{$word});
643	next if ($symbol{$word});
644
645	$vocab[$nextsymbol] = $word;
646	$symbol{$word} = $nextsymbol;
647	$nextsymbol++;
648	}
649	$last_stopword = $nextsymbol - 1;
650	$first_contentword = $nextsymbol;
651
652	# Thesaurus terms
653	if ($use_thesaurus) {
654	$first_thesaurusword = $nextsymbol;
655
656	foreach my $word (sort keys %thesaurus) {
657
658	$word = lc($word);
659	next if ($symbol{$word});
660	$bestform{$word} = $thesaurus{$word};
661
662	$vocab[$nextsymbol] = $word;
663	$symbol{$word} = $nextsymbol;
664	$nextsymbol++;
665
666	}
667	$last_thesaurusword = $nextsymbol - 1;
668	}
669
670	# Other content words
671	$first_extractword = $nextsymbol;
672
673	foreach my $word (sort (keys %bestform)) {
674
675	next if ($symbol{$word});
676
677	$vocab[$nextsymbol] = $word;
678	$symbol{$word} = $nextsymbol;
679	$nextsymbol++;
680	}
681	$last_extractword = $nextsymbol - 1;
682	$last_contentword = $nextsymbol - 1;
683
684
685	# Outut the words
686	print "Saving vocabulary in $phindex_dir/clauses.vocab\n" if ($verbosity > 1);
687	open(VOC, ">$phindex_dir/clauses.vocab");
688
689	for (my $i = 1; $i < $nextsymbol; $i++) {
690	$w = $vocab[$i];
691
692	print VOC "$bestform{$w}\n";
693	$totalfreq{$w} = 0 unless ($totalfreq{$w});
694	}
695	close VOC;
696
697
698	# Output statistics about the vocablary
699	print "Saving statistics in $phindex_dir/clauses.stats\n" if ($verbosity > 1);
700	&util::rm("$phindex_dir/clauses.stats") if (-e "$phindex_dir/clauses.stats");
701	open(STAT, ">$phindex_dir/clauses.stats")
702	\|\| die "Cannot open $phindex_dir/clauses.stats: $!";
703
704	print STAT "first_delimiter $first_delimiter\n";
705	print STAT "last_delimiter $last_delimiter\n";
706	print STAT "first_stopword $first_stopword\n";
707	print STAT "last_stopword $last_stopword\n";
708	if ($use_thesaurus) {
709	print STAT "first_thesaurusword $first_thesaurusword\n";
710	print STAT "last_thesaurusword $last_thesaurusword\n";
711	}
712	print STAT "first_extractword $first_extractword\n";
713	print STAT "last_extractword $last_extractword\n";
714	print STAT "first_contentword $first_contentword\n";
715	print STAT "last_contentword $last_contentword\n";
716	print STAT "first_symbol $first_delimiter\n";
717	print STAT "last_symbol $last_contentword\n";
718	print STAT "first_word $first_stopword\n";
719	print STAT "last_word $last_contentword\n";
720	close STAT;
721
722	undef @vocab;
723
724
725	# Save text as symbol numbers
726	print "Saving text as numbers in $phindex_dir/clauses.numbers\n" if ($verbosity > 1);
727
728	open(TXT, "<$phindex_dir/clauses");
729	open(NUM, ">$phindex_dir/clauses.numbers");
730
731	$phrasedelimiter = $symbol{lc($senlimit)};
732	print NUM "$symbol{lc($colstart)}\n";
733
734	# set up the special symbols that delimit documents and sentences
735	while(<TXT>) {
736
737	# split sentence into a list of tokens
738	$line = $_;
739	next unless ($line =~ /./);
740	@words = split(/\s+/, $line);
741
742	# output one token at a time
743	foreach $word (@words) {
744	$word = lc($word);
745	print NUM "$symbol{$word}\n";
746	}
747
748	# output phrase delimiter
749	print NUM "$phrasedelimiter\n";
750	}
751
752	print NUM "$symbol{lc($colend)}\n";
753
754	}
755
756
757	# Prepare the phrases file to be input to mgpp.
758	# This means renumbering the phrases in order of decreasing frequency.
759
760
761	# This is legacy code, and a little ugly, and may be unix-specific
762	# (particularly the sort command).
763
764	sub renumber_phrases {
765	my ($phindex_dir, $verbosity) = @_;
766
767	# Sort the phrases into order of increasing frequency
768	# This means the expansions will be sorted correctly later on.
769	print "Sorting phrases into freq order\n" if ($verbosity);
770	system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases");
771
772	my @symbol;
773
774	# Read the vocabulary
775	print "Reading the vocabulary\n" if ($verbosity);
776	open(V, "<$phindex_dir/clauses.vocab")
777	\|\| die "Cannot open $phindex_dir/clauses.vocab: $!";
778
779	my $i = 1;
780	while(<V>) {
781	chomp;
782	$symbol[$i++] = $_;
783	}
784
785	# Create file for phrase data
786	#
787	# The phrases file looks something like this
788	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
789	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
790	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
791	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
792
793	# The first field on each line is a unique phrase identifier.
794	# We need to calculate phrase numbers for each phrase
795	print "Calculate phrase numbers\n" if ($verbosity);
796
797	my %phrasenumber;
798	my $nextphrase = 1;
799	my ($line);
800
801	open(IN, "<$phindex_dir/phrases");
802	while(<IN>) {
803
804	# read the line
805	chomp;
806	$line = $_;
807
808	# we're only interested in the first field
809	$line =~ s/:.*//;
810
811	# get a phrase number for this line
812	$phrasenumber{$line} = $nextphrase;
813	$nextphrase++;
814	}
815
816
817	# Now we create a new phrase file using phrase numbers, not the old IDs.
818	print "Format phrase data for MGPP\n" if ($verbosity);
819
820	open(IN, "<$phindex_dir/phrases");
821	open(DATA, ">$phindex_dir/pdata.txt");
822	open(IDX, ">$phindex_dir/pword.txt");
823
824	my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
825	my @fields;
826	my @documents;
827	my (@newexp, $k, $n);
828
829	my $linenumber = 0;
830
831	while(<IN>) {
832
833	# read the line
834	chomp;
835	$line = $_;
836	@fields = split(/:/, $line);
837
838	# get a phrase number for this line
839	$key = shift @fields;
840	die unless (defined($phrasenumber{$key}));
841	$num = $phrasenumber{$key};
842
843	# get the text of the phrase
844	$text = shift @fields;
845	$text =~ s/s(\d+)/$symbol[$1]/g;
846	if ($text =~ / /) {
847	$word = "";
848	} else {
849	$word = $text;
850	}
851
852	$linenumber++;
853	if ($linenumber % 1000 == 0) {
854	print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 2);
855	}
856	print "$num: $key\t($text)\n" if ($verbosity > 3);
857
858	# get the phrase frequency
859	$tf = shift @fields;
860
861	# get the number of expansions
862	$countexp = shift @fields;
863
864	# get the expansions and convert them into phrase numbers
865	$expansions = shift @fields;
866	@newexp = ();
867	foreach $k (split(/,/, $expansions)) {
868	die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
869	$n = $phrasenumber{$k};
870	push @newexp, $n;
871	}
872	@newexp = sort numerically @newexp;
873
874	# get the number of documents
875	$countdocs = shift @fields;
876
877	# get the documents
878	$documents = shift @fields;
879	$documents =~ s/d//g;
880	@documents = split(/;/, $documents);
881	@documents = sort by_frequency @documents;
882
883	# output the phrase data
884	print DATA "<Document>";
885	print DATA "$num:$text:$tf:$countexp:$countdocs:";
886	print DATA join(",", @newexp), ":", join(";", @documents), "\n";
887
888	# output the word index search data
889	print IDX "<Document>$word\n";
890
891
892	}
893	}
894
895	# sort routines used to renumber phrases
896
897	sub numerically { $a <=> $b }
898
899	sub by_frequency {
900	my $fa = 1;
901	if ($a =~ /,/) {
902	$fa = $a;
903	$fa =~ s/\d+,//;
904	}
905	my $fb = 1;
906	if ($b =~ /,/) {
907	$fb = $b;
908	$fb =~ s/\d+,//;
909	}
910
911	return ($fb <=> $fa);
912	}
913
914
915	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: