Context Navigation

source: trunk/gsdl/perllib/classify/phind.pm@ 1645

Last change on this file since 1645 was 1645, checked in by paynter, 24 years ago
Output less verbose & more consistant with buildcol.pl
Property svn:keywords set to `Author Date Id Revision`
File size: 25.5 KB

Line
1	###########################################################################
2	#
3	# phind.pm -- the Phind classifier
4	#
5	# Copyright (C) 2000 Gordon W. Paynter
6	# Copyright (C) 2000 New Zealand Digital Library Project
7	#
8	#
9	# A component of the Greenstone digital library software
10	# from the New Zealand Digital Library Project at the
11	# University of Waikato, New Zealand.
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# The phind clasifier plugin
30	#
31	# options are:
32	# title=Title The title field for this classification
33	# text=fields The text used to build the phrase hierarchy
34	# untidy=true Do not clean up intermediate files
35	# phindexdir=directory Location of phind index files
36	# verbosity=num Control amount of output
37
38
39	# How a classifier works.
40	#
41	# When a classifier is requested in the collect.cfg file, buildcol creates a
42	# new classifier object (such as the one defined in theis file) and later
43	# passes each document object to the classifier in turn. Four functions are
44	# used:
45	#
46	# 1. "new" is called before the documents are processed to set up the
47	# classifier.
48	#
49	# 2. "init" is called after buildcol.pl has created the indexes etc but
50	# before the documents are classified in order that the classifier might
51	# set any varioables it requiers, etc.
52	#
53	# 3. "classify" is called once for each document object. The classifier
54	# "classifies" each document and updates its local data accordingly.
55	#
56	# 4. "get_classify_info" is called after every document has been
57	# classified. It collates the information about the documents and
58	# stores a reference to the classifier so that Greenstone can later
59	# display it.
60
61
62	package phind;
63
64	use BasClas;
65	use util;
66
67	sub BEGIN {
68	@ISA = ('BasClas');
69	}
70
71	# Define delimiter symbols - this should be abstracted out someplace
72	my $colstart = "COLLECTIONSTART";
73	my $colend = "COLLECTIONEND";
74	my $doclimit = "DOCUMENTLIMIT";
75	my $senlimit = "SENTENCELIMIT";
76	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
77
78	# Create a new phind browser based on the options in collect.cfg
79
80	sub new {
81	my ($class, @options) = @_;
82	my $self = new BasClas ($class, @_);
83	my $out = pop @options;
84
85	# Phind installation check
86	# The phind phrase browser is research software and is not installed
87	# by defualt. If the user attepts to use it we warn them that it's a
88	# bit dodgy, then tell them how to install it. If they can do that
89	# and get all the files in place, then we let them proceed.
90
91	print $out "The Phind classifier for Greenstone.\n";
92	print $out "Checking the phind phrase browser requirements...\n";
93
94	# Make sure we're not in windows
95	if ($ENV{'GSDLOS'} =~ /windows/i) {
96	print STDERR "Phind currently only works under Unix";
97	exit(1);
98	}
99
100	# Ensure the Phind generate scripts are in place
101	my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
102	my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
103
104	if (!(-e $file1)) {
105	print STDERR "The phind \"suffix\" program is not installed. ";
106	print STDERR "To install it, change to the directory\n";
107	print STDERR " $src\n";
108	print STDERR "and type \"make install-phind\".\n\n";
109	exit(1);
110	}
111
112	# Ensure the Phind CGI script is in place
113	$file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
114	$src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");
115
116	if (!(-e $file1)) {
117	print STDERR "The phind CGI program is not installed. ";
118	print STDERR "To install it, change to the directory\n";
119	print STDERR " $src\n";
120	print STDERR "and type \"make install-phind\".\n\n";
121	exit(1);
122	}
123
124	# Ensure the Phind Java applet is in place
125	$src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
126	$file1 = &util::filename_cat($src, "Phind.class");
127
128	if (!(-e $file1)) {
129	print STDERR "The phind Java classes are not compiled. ";
130	print STDERR "To compile them, change to the directory\n";
131	print STDERR " $src\n";
132	print STDERR "and use your Java compiler to compile Phind.java.\n";
133	print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
134	exit(1);
135	}
136
137
138	# The installation appears OK - set up the classifier
139	my $collection = $ENV{'GSDLCOLLECTION'};
140	my $phindexdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"phindex");
141	my $language = "english";
142
143	my $title = "Topic";
144	my $indexes = "section:Title,section:text";
145
146	my $verbosity = 2;
147	my $untidy = 0;
148
149	# parse the options
150	foreach $option (@options) {
151
152	if ($option =~ /^text=(.*)$/i) {
153	$indexes = $1;
154	} elsif ($option =~ /^title=(.*)$/i) {
155	$title = $1;
156	} elsif ($option =~ /^phindexdir=(.*)$/i) {
157	$phindexdir = $1;
158	} elsif ($option =~ /^verbosity=(.*)$/i) {
159	$verbosity = $1;
160	} elsif ($option =~ /^untidy/i) {
161	$untidy = 1;
162	}
163	}
164
165
166	$self->{'collection'} = $collection;
167	$self->{'title'} = $title;
168	$self->{'indexes'} = $indexes;
169	$self->{'verbosity'} = $verbosity;
170	$self->{'untidy'} = $untidy;
171
172	# limit languages
173	$language =~ s/,/\\|/g;
174	$self->{'language_exp'} = $language;
175	$self->{'delimiter'} = $delimiter;
176
177	# reset phindex directory
178	if (-e "$phindexdir") {
179	&util::rm_r("$phindexdir");
180	}
181	&util::mk_dir("$phindexdir");
182	$self->{'phindexdir'} = $phindexdir;
183
184	return bless $self, $class;
185	}
186
187
188	# Initialise the phind classifier
189
190	sub init {
191	my $self = shift (@_);
192
193	# open filehandles for documents and text
194	my $phindexdir = $self->{'phindexdir'};
195
196	my $clausefile = &util::filename_cat("$phindexdir", "clauses");
197	&util::rm($clausefile) if (-e $clausefile);
198	open(TEXT, ">$clausefile") \|\| die "Cannot open $clausefile: $!";
199	$self->{'txthandle'} = TEXT;
200
201	my $docfile = &util::filename_cat("$phindexdir", "docs.txt");
202	&util::rm($docfile) if (-e $docfile);
203	open(DOCS, ">$docfile") \|\| die "Cannot open $docfile: $!";
204	$self->{'dochandle'} = DOCS;
205
206	}
207
208	# Classify each document.
209	#
210	# Each document is passed here in turn. The classifier extracts the
211	# text of each and stores it in the clauses file. Document details are
212	# stored in the docs.txt file.
213
214	sub classify {
215	my $self = shift (@_);
216	my ($doc_obj) = @_;
217
218	my $verbosity = $self->{'verbosity'};
219	my $top_section = $doc_obj->get_top_section();
220
221	my $title = $doc_obj->get_metadata_element ($top_section, "Title");
222	print "process: $title\n" if ($verbosity > 2);
223
224
225	# only consider english-language files
226	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
227	my $phrlanguage = $self->{'language_exp'};
228	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
229
230	# record this file
231	my $total++;
232	print "file $total: $file\n" if ($self->{'$verbosity'});
233
234
235	# Store document details
236	my $OID = $doc_obj->get_OID();
237	$OID = "NULL" unless defined $OID;
238	my $dochandle = $self->{'dochandle'};
239	print $dochandle "<Document>\t$OID\t$title\n";
240
241	# Store the text occuring in this object
242
243	# output the document delimiter
244	my $txthandle = $self->{'txthandle'};
245	print $txthandle "$doclimit\n";
246
247	# iterarate over the required indexes and store their text
248	my $indexes = $self->{'indexes'};
249	my $text = "";
250	my ($part, $level, $field, $section, $data);
251
252	foreach $part (split(/,/, $indexes)) {
253
254	# Each field has a level and a data element ((e.g. document:Title)
255	($level, $field) = split(/:/, $part);
256	die unless ($level && $field);
257
258	# Extract the text from every section
259	# (In phind, document:text and section:text are equivalent)
260	if ($field eq "text") {
261	$data = "";
262	$section = $doc_obj->get_top_section();
263	while (defined($section)) {
264	$data .= $doc_obj->get_text($section) . "\n";
265	$section = $doc_obj->get_next_section($section);
266	}
267	$text .= convert_gml_to_tokens($data) . "\n";
268	}
269
270	# Extract a metadata field from a document
271	elsif ($level eq "document") {
272	$data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
273	$text .= convert_gml_to_tokens($data) . "\n";
274	}
275
276	# Extract metadata from every section in a document
277	elsif ($level eq "section") {
278	$data = "";
279	$section = $doc_obj->get_top_section();
280	while (defined($section)) {
281	$data .= $doc_obj->get_metadata_element($section, $field) . "\n";
282	$section = $doc_obj->get_next_section($section);
283	}
284	$text .= convert_gml_to_tokens($data) . "\n";
285	}
286
287	# Some sort of specification which I don't understand
288	else {
289	die "Unknown level ($level) in phind index ($part)\n";
290	}
291
292	}
293
294	# output the text
295	$text =~ tr/\n//s;
296	print $txthandle "$text";
297
298	}
299
300
301
302	# Construct the classifier from the information already gathered
303	#
304	# When get_classify_info is called, the clauses and docs.txt files have
305	# already been constructed in the phindex directory. This function will
306	# translate them into compressed, indexed MGPP files that can be read by
307	# the phindcgi script. It will also register our classifier so that it
308	# shows up in thenavigation bar.
309
310	sub get_classify_info {
311	my $self = shift (@_);
312
313	my $verbosity = $self->{'verbosity'};
314	my $phindexdir = $self->{'phindexdir'};
315	my $language = "english";
316
317	if ($verbosity) {
318	print STDERR "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
319	}
320
321	# Construct phind indexes
322	my $mode = 1;
323	my $symbol_limit = 40000000;
324	my ($command, $status);
325
326	# Generate the vocabulary, symbol statistics, and numbers file
327	# from the clauses file
328	print "\nExtracting vocabulary and statistics\n" if $verbosity;
329	&extract_vocabulary($phindexdir, $language, $verbosity);
330
331	# Use the suffix program to generate the phindex/phrases file
332	print "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
333	&execute("suffix $phindexdir $symbol_limit $mode", $verbosity);
334
335	# Create the phrase file and put phrase numbers in phindex/phrases
336	print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
337	&renumber_phrases("$phindexdir", $verbosity);
338
339	# Create the mg phrase database
340	my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
341	my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
342	my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
343
344	my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
345	my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
346	my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
347	my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
348
349	print "\nCreating phrase databases\n";
350	my $mg_input = &util::filename_cat($phindexdir, "pdata.txt");
351	my $mg_stem = "pdata";
352
353	&execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
354	&execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
355	&execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);
356
357	# create the mg index of words
358	print "\nCreating word-level search indexes\n";
359	$mg_input = &util::filename_cat($phindexdir, "pword.txt");
360	$mg_stem = "pword";
361
362	&execute("$mg_passes -d $phindexdir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
363	&execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
364	&execute("$mg_perf_hash_build -d $phindexdir -f $mg_stem", $verbosity);
365	&execute("$mg_passes -d $phindexdir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
366	&execute("$mg_weights_build -d $phindexdir -f $mg_stem", $verbosity);
367	&execute("$mg_invf_dict -d $phindexdir -f $mg_stem", $verbosity);
368
369	&execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 1", $verbosity);
370	&execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 2", $verbosity);
371	&execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 3", $verbosity);
372
373	# create the mg document information database
374	print "\nCreating document information databases\n";
375	$mg_input = &util::filename_cat($phindexdir, "docs.txt");
376	$mg_stem = "docs";
377
378	&execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
379	&execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
380	&execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);
381
382
383	# Tidy up stray files
384	if (!$untidy) {
385	print "\nCleaning up\n" if ($verbosity > 2);
386	&util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
387	"$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
388	"$phindexdir/phrases", "$phindexdir/docs.txt",
389	"$phindexdir/pdata.txt", "$phindexdir/pword.txt");
390	my $outfile = 1;
391	while (-e "$phindexdir/outPhrase.$outfile") {
392	&util::rm("$phindexdir/outPhrase.$outfile");
393	$outfile++;
394	}
395	}
396
397
398	# Insert the classifier into.... what?
399	my $collection = $self->{'collection'};
400	my $url = "library?a=p&p=phind&c=$collection";
401
402	my %classifyinfo = ('thistype'=>'Invisible',
403	'childtype'=>'Phind',
404	'Title'=>$self->{'title'},
405	'contains'=>[]);
406
407	push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
408	return \%classifyinfo;
409	}
410
411
412
413	sub convert_gml_to_tokens {
414
415	$_ = shift @_;
416
417	# FIRST, remove GML tags
418
419	# Replace all whitespace with a simple space
420	s/\s+/ /gs;
421
422	# Remove everything that is in a tag
423	s/\s<p>\s/ PARAGRAPHBREAK /isg;
424	s/\s<br>\s/ LINEBREAK /isg;
425	s/<[^>]*>/ /sg;
426
427	# Now we have the text, but it may contain HTML
428	# elements coded as > etc. Remove these tags.
429	s/</</sg;
430	s/>/>/sg;
431
432	s/\s+/ /sg;
433	s/\s<p>\s/ PARAGRAPHBREAK /isg;
434	s/\s<br>\s/ LINEBREAK /isg;
435	s/<[^>]*>/ /sg;
436
437	# remove & and other miscellaneous markup tags
438	s/&/&/sg;
439	s/</</sg;
440	s/>/>/sg;
441	s/&/&/sg;
442
443	# replace<p> and <br> placeholders with carriage returns
444	s/PARAGRAPHBREAK/\n/sg;
445	s/LINEBREAK/\n/sg;
446
447
448	# Exceptional punctuation
449	#
450	# We make special cases of some punctuation
451
452	# remove any apostrophe that indicates omitted letters
453	s/(\w+)\'(\w*\s)/ $1$2 /g;
454
455	# remove period that appears in a person's initals
456	s/\s([A-Z])\./ $1 /g;
457
458	# replace hyphens in hypheanted words and names with a space
459	s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
460
461
462	# Convert the remaining text to "clause format",
463	# This means removing all excess punctuation and garbage text,
464	# normalising valid punctuation to fullstops and commas,
465	# then putting one cluse on each line.
466
467	# Insert newline when the end of a sentence is detected
468	# (delimter is: "[\.\?\!]\s")
469	s/\s*[\.\?\!]\s+/\n/g;
470
471	# split numbers after four digits
472	s/(\d\d\d\d)/$1 /g;
473
474	# split words after 32 characters
475
476	# squash repeated punctuation
477	tr/A-Za-z0-9 //cs;
478
479	# save email addresses
480	# s/\w+@\w+\.[\w\.]+/EMAIL/g;
481
482	# normalise clause breaks (mostly punctuation symbols) to commas
483	s/[^A-Za-z0-9 \n]+/ , /g;
484
485	# Remove repeated commas, and replace with newline
486	s/\s*,[, ]+/\n/g;
487
488	# remove extra whitespace
489	s/ +/ /sg;
490	s/^\s+//mg;
491	s/\s*$/\n/mg;
492
493	# remove lines that contain one word or less
494	s/^\w*$//mg;
495	s/^\s*$//mg;
496	tr/\n//s;
497
498	return $_;
499	}
500
501
502	# Execute a system command
503
504	sub execute {
505	my ($command, $verbosity) = @_;
506	print "Executing: $command\n" if ($verbosity > 2);
507	my $status = system($command);
508	if ($status != 0) {
509	print STDERR "phindgen.pl - Error executing $command: $!\n";
510	exit($status);
511	}
512	}
513
514
515	# Generate the vocabulary, symbol statistics, and numbers file from the
516	# clauses file. This is legacy code, so is a bit messy and probably wont
517	# run under windows.
518
519	sub extract_vocabulary {
520	my ($phindex_dir, $language, $verbosity) = @_;
521
522	my ($w, $l, $line, $word);
523
524	my ($first_delimiter, $last_delimiter,
525	$first_stopword, $last_stopword,
526	$first_extractword, $last_extractword,
527	$first_contentword, $last_contentword,
528	$phrasedelimiter);
529
530	my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);
531
532
533	my %symbol;
534	my (%freq);
535
536	print "Calculating vocabulary\n" if ($verbosity > 1);
537
538	# Read and store the stopwords
539	my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" \| xargs cat`;
540	my %stopwords;
541	foreach my $w (split(/\s+/, $words)) {
542	$l = lc($w);
543	$stopwords{$l} = $w;
544	}
545
546	# Read and store the thesaurus terms
547	$use_thesaurus = 0;
548	my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
549	"$language", "agrovoc.lex");
550	if (-e "$lex_file") {
551	open(TH, "<$lex_file");
552	while(<TH>) {
553	s/^\d+ //;
554	s/\(.*\)//;
555	foreach my $w (split(/\s+/, $_)) {
556	$thesaurus{lc($w)} = $w;
557	}
558	}
559	close TH;
560	$use_thesaurus = 1;
561	}
562
563	# Read words in the text and count occurences
564	open(TXT, "<$phindex_dir/clauses");
565	my @words;
566
567	while(<TXT>) {
568	$line = $_;
569	next unless ($line =~ /./);
570
571	@words = split(/\s+/, $line);
572	foreach $w (@words) {
573	$l = lc($w);
574	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
575	$freq{$w}++;
576	}
577	$freq{$senlimit}++;
578	}
579
580	# Calculate the "best" form of each word
581	my (%bestform, %totalfreq, %bestfreq);
582
583	foreach $w (sort (keys %freq)) {
584	$l = lc($w);
585
586	# totalfreq is the number of times a term appears in any form
587	$totalfreq{$l} += $freq{$w};
588
589	if (defined $stopwords{$l}) {
590	$bestform{$l} = $stopwords{$l};
591
592	} elsif (defined $thesaurus{$l}) {
593	$bestform{$l} = $thesaurus{$l};
594
595	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
596	$bestfreq{$l} = $freq{$w};
597	$bestform{$l} = $w;
598	}
599	}
600
601	undef %freq;
602	undef %bestfreq;
603
604
605	# Assign symbol numbers to tokens
606	my $nextsymbol = 1;
607	my (@vocab);
608
609	# Delimiters
610	$first_delimiter = 1;
611
612	foreach $word (@delimiters) {
613
614	$word = lc($word);
615	$bestform{$word} = uc($word);
616	$vocab[$nextsymbol] = $word;
617	$symbol{$word} = $nextsymbol;
618	$nextsymbol++;
619	}
620	$last_delimiter = $nextsymbol - 1;
621
622	# Stopwords
623	$first_stopword = $nextsymbol;
624
625	foreach my $word (sort keys %stopwords) {
626
627	# don't incluse stopword unless it occurs in the text
628	$word = lc($word);
629	next unless ($totalfreq{$word});
630	next if ($symbol{$word});
631
632	$vocab[$nextsymbol] = $word;
633	$symbol{$word} = $nextsymbol;
634	$nextsymbol++;
635	}
636	$last_stopword = $nextsymbol - 1;
637	$first_contentword = $nextsymbol;
638
639	# Thesaurus terms
640	if ($use_thesaurus) {
641	$first_thesaurusword = $nextsymbol;
642
643	foreach my $word (sort keys %thesaurus) {
644
645	$word = lc($word);
646	next if ($symbol{$word});
647	$bestform{$word} = $thesaurus{$word};
648
649	$vocab[$nextsymbol] = $word;
650	$symbol{$word} = $nextsymbol;
651	$nextsymbol++;
652
653	}
654	$last_thesaurusword = $nextsymbol - 1;
655	}
656
657	# Other content words
658	$first_extractword = $nextsymbol;
659
660	foreach my $word (sort (keys %bestform)) {
661
662	next if ($symbol{$word});
663
664	$vocab[$nextsymbol] = $word;
665	$symbol{$word} = $nextsymbol;
666	$nextsymbol++;
667	}
668	$last_extractword = $nextsymbol - 1;
669	$last_contentword = $nextsymbol - 1;
670
671
672	# Outut the words
673	print "Saving vocabulary in $phindex_dir/clauses.vocab\n" if ($verbosity > 1);
674	open(VOC, ">$phindex_dir/clauses.vocab");
675
676	for (my $i = 1; $i < $nextsymbol; $i++) {
677	$w = $vocab[$i];
678
679	print VOC "$bestform{$w}\n";
680	$totalfreq{$w} = 0 unless ($totalfreq{$w});
681	}
682	close VOC;
683
684
685	# Output statistics about the vocablary
686	print "Saving statistics in $phindex_dir/clauses.stats\n" if ($verbosity > 1);
687	&util::rm("$phindex_dir/clauses.stats") if (-e "$phindex_dir/clauses.stats");
688	open(STAT, ">$phindex_dir/clauses.stats")
689	\|\| die "Cannot open $phindex_dir/clauses.stats: $!";
690
691	print STAT "first_delimiter $first_delimiter\n";
692	print STAT "last_delimiter $last_delimiter\n";
693	print STAT "first_stopword $first_stopword\n";
694	print STAT "last_stopword $last_stopword\n";
695	if ($use_thesaurus) {
696	print STAT "first_thesaurusword $first_thesaurusword\n";
697	print STAT "last_thesaurusword $last_thesaurusword\n";
698	}
699	print STAT "first_extractword $first_extractword\n";
700	print STAT "last_extractword $last_extractword\n";
701	print STAT "first_contentword $first_contentword\n";
702	print STAT "last_contentword $last_contentword\n";
703	print STAT "first_symbol $first_delimiter\n";
704	print STAT "last_symbol $last_contentword\n";
705	print STAT "first_word $first_stopword\n";
706	print STAT "last_word $last_contentword\n";
707	close STAT;
708
709	undef @vocab;
710
711
712	# Save text as symbol numbers
713	print "Saving text as numbers in $phindex_dir/clauses.numbers\n" if ($verbosity > 1);
714
715	open(TXT, "<$phindex_dir/clauses");
716	open(NUM, ">$phindex_dir/clauses.numbers");
717
718	$phrasedelimiter = $symbol{lc($senlimit)};
719	print NUM "$symbol{lc($colstart)}\n";
720
721	# set up the special symbols that delimit documents and sentences
722	while(<TXT>) {
723
724	# split sentence into a list of tokens
725	$line = $_;
726	next unless ($line =~ /./);
727	@words = split(/\s+/, $line);
728
729	# output one token at a time
730	foreach $word (@words) {
731	$word = lc($word);
732	print NUM "$symbol{$word}\n";
733	}
734
735	# output phrase delimiter
736	print NUM "$phrasedelimiter\n";
737	}
738
739	print NUM "$symbol{lc($colend)}\n";
740
741	}
742
743
744	# Prepare the phrases file to be input to mgpp.
745	# This means renumbering the phrases in order of decreasing frequency.
746
747
748	# This is legacy code, and a little ugly, and may be unix-specific
749	# (particularly the sort command).
750
751	sub renumber_phrases {
752	my ($phindex_dir, $verbosity) = @_;
753
754	# Sort the phrases into order of increasing frequency
755	# This means the expansions will be sorted correctly later on.
756	print "Sorting phrases into freq order\n" if ($verbosity);
757	system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases");
758
759	my @symbol;
760
761	# Read the vocabulary
762	print "Reading the vocabulary\n" if ($verbosity);
763	open(V, "<$phindex_dir/clauses.vocab")
764	\|\| die "Cannot open $phindex_dir/clauses.vocab: $!";
765
766	my $i = 1;
767	while(<V>) {
768	chomp;
769	$symbol[$i++] = $_;
770	}
771
772	# Create file for phrase data
773	#
774	# The phrases file looks something like this
775	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
776	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
777	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
778	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
779
780	# The first field on each line is a unique phrase identifier.
781	# We need to calculate phrase numbers for each phrase
782	print "Calculate phrase numbers\n" if ($verbosity);
783
784	my %phrasenumber;
785	my $nextphrase = 1;
786	my ($line);
787
788	open(IN, "<$phindex_dir/phrases");
789	while(<IN>) {
790
791	# read the line
792	chomp;
793	$line = $_;
794
795	# we're only interested in the first field
796	$line =~ s/:.*//;
797
798	# get a phrase number for this line
799	$phrasenumber{$line} = $nextphrase;
800	$nextphrase++;
801	}
802
803
804	# Now we create a new phrase file using phrase numbers, not the old IDs.
805	print "Format phrase data for MGPP\n" if ($verbosity);
806
807	open(IN, "<$phindex_dir/phrases");
808	open(DATA, ">$phindex_dir/pdata.txt");
809	open(IDX, ">$phindex_dir/pword.txt");
810
811	my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
812	my @fields;
813	my @documents;
814	my (@newexp, $k, $n);
815
816	my $linenumber = 0;
817
818	while(<IN>) {
819
820	# read the line
821	chomp;
822	$line = $_;
823	@fields = split(/:/, $line);
824
825	# get a phrase number for this line
826	$key = shift @fields;
827	die unless (defined($phrasenumber{$key}));
828	$num = $phrasenumber{$key};
829
830	# get the text of the phrase
831	$text = shift @fields;
832	$text =~ s/s(\d+)/$symbol[$1]/g;
833	if ($text =~ / /) {
834	$word = "";
835	} else {
836	$word = $text;
837	}
838
839	$linenumber++;
840	if ($linenumber % 1000 == 0) {
841	print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 2);
842	}
843	print "$num: $key\t($text)\n" if ($verbosity > 3);
844
845	# get the phrase frequency
846	$tf = shift @fields;
847
848	# get the number of expansions
849	$countexp = shift @fields;
850
851	# get the expansions and convert them into phrase numbers
852	$expansions = shift @fields;
853	@newexp = ();
854	foreach $k (split(/,/, $expansions)) {
855	die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
856	$n = $phrasenumber{$k};
857	push @newexp, $n;
858	}
859	@newexp = sort numerically @newexp;
860
861	# get the number of documents
862	$countdocs = shift @fields;
863
864	# get the documents
865	$documents = shift @fields;
866	$documents =~ s/d//g;
867	@documents = split(/;/, $documents);
868	@documents = sort by_frequency @documents;
869
870	# output the phrase data
871	print DATA "<Document>";
872	print DATA "$num:$text:$tf:$countexp:$countdocs:";
873	print DATA join(",", @newexp), ":", join(";", @documents), "\n";
874
875	# output the word index search data
876	print IDX "<Document>$word\n";
877
878
879	}
880	}
881
882	# sort routines used to renumber phrases
883
884	sub numerically { $a <=> $b }
885
886	sub by_frequency {
887	my $fa = 1;
888	if ($a =~ /,/) {
889	$fa = $a;
890	$fa =~ s/\d+,//;
891	}
892	my $fb = 1;
893	if ($b =~ /,/) {
894	$fb = $b;
895	$fb =~ s/\d+,//;
896	}
897
898	return ($fb <=> $fa);
899	}
900
901
902	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: