Context Navigation

source: main/tags/2.30/gsdl/perllib/classify/phind.pm@ 23841

Last change on this file since 23841 was 1808, checked in by paynter, 24 years ago
Option to save the phind phrases to a text file.
Property svn:keywords set to `Author Date Id Revision`
File size: 27.6 KB

Line
1	###########################################################################
2	#
3	# phind.pm -- the Phind classifier
4	#
5	# Copyright (C) 2000 Gordon W. Paynter
6	# Copyright (C) 2000 New Zealand Digital Library Project
7	#
8	#
9	# A component of the Greenstone digital library software
10	# from the New Zealand Digital Library Project at the
11	# University of Waikato, New Zealand.
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# The phind clasifier plugin
30	#
31	# options are:
32	# button=Name The label for the classifiers button in the
33	# navigation bar (defaults to "Phrase").
34	# title=Title The metadata field used to describe each document
35	# (defaults to "Title").
36	# text=fields The text used to build the phrase hierarchy
37	# (defaults to "section:Title,section:text").
38	# phinddir=directory Location of phind index files
39	# verbosity=num Control amount of output
40	# untidy=true Do not clean up intermediate files
41	# suffixmode=num Mode of suffix program (0 = all phrases, 1 = stopword)
42	# suffixsize=num Number of symbols available to suffix program
43	# savephrases=filename If set, phrase infomation will be stored in filename
44	# as text. (By defualt, it is not set.)
45
46	# How a classifier works.
47	#
48	# For each classifier requested in the collect.cfg file, buildcol.pl creates
49	# a new classifier object (such as the one defined in theis file) and later
50	# passes each document object to the classifier in turn for classification.
51	#
52	# Four functions are used:
53	#
54	# 1. "new" is called before the documents are processed to set up the
55	# classifier.
56	#
57	# 2. "init" is called after buildcol.pl has created the indexes etc but
58	# before the documents are classified in order that the classifier might
59	# set any variables it requires, etc.
60	#
61	# 3. "classify" is called once for each document object. The classifier
62	# "classifies" each document and updates its local data accordingly.
63	#
64	# 4. "get_classify_info" is called after every document has been
65	# classified. It collates the information about the documents and
66	# stores a reference to the classifier so that Greenstone can later
67	# display it.
68
69
70	package phind;
71
72	use BasClas;
73	use util;
74
75	sub BEGIN {
76	@ISA = ('BasClas');
77	}
78
79	# Define delimiter symbols - this should be abstracted out someplace
80	my $colstart = "COLLECTIONSTART";
81	my $colend = "COLLECTIONEND";
82	my $doclimit = "DOCUMENTLIMIT";
83	my $senlimit = "SENTENCELIMIT";
84	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
85
86	# Create a new phind browser based on the options in collect.cfg
87
88	sub new {
89	my ($class, @options) = @_;
90	my $self = new BasClas ($class, @_);
91	my $out = pop @options;
92
93	# Phind installation check
94	# The phind phrase browser is research software and is not installed
95	# by defualt. If the user attepts to use it we warn them that it's a
96	# bit dodgy, then tell them how to install it. If they can do that
97	# and get all the files in place, then we let them proceed.
98
99	print $out "The Phind classifier for Greenstone.\n";
100	print $out "Checking the phind phrase browser requirements...\n";
101
102	# Make sure we're not in windows
103	if ($ENV{'GSDLOS'} =~ /windows/i) {
104	print STDERR "Phind currently only works under Unix";
105	exit(1);
106	}
107
108	# Ensure the Phind generate scripts are in place
109	my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
110	my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
111
112	if (!(-e $file1)) {
113	print STDERR "The phind \"suffix\" program is not installed. ";
114	print STDERR "To install it, change to the directory\n";
115	print STDERR " $src\n";
116	print STDERR "and type \"make install-phind\".\n\n";
117	exit(1);
118	}
119
120	# Ensure the Phind CGI script is in place
121	$file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
122	$src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");
123
124	if (!(-e $file1)) {
125	print STDERR "The phind CGI program is not installed. ";
126	print STDERR "To install it, change to the directory\n";
127	print STDERR " $src\n";
128	print STDERR "and type \"make install-phind\".\n\n";
129	exit(1);
130	}
131
132	# Ensure the Phind Java applet is in place
133	$src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
134	$file1 = &util::filename_cat($src, "Phind.class");
135
136	if (!(-e $file1)) {
137	print STDERR "The phind Java classes are not compiled. ";
138	print STDERR "To compile them, change to the directory\n";
139	print STDERR " $src\n";
140	print STDERR "and use your Java compiler to compile Phind.java.\n";
141	print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
142	exit(1);
143	}
144
145	# The installation appears OK - set up the classifier
146	my $collection = $ENV{'GSDLCOLLECTION'};
147	my $language = "english";
148
149	my $button = "Phrase";
150	my $title = "Title";
151	my $indexes = "section:Title,section:text";
152
153	my $builddir = "";
154	my $phinddir = "";
155
156	my $suffixmode = 1;
157	my $suffixsize = 40000000;
158	my $savephrases = "";
159
160	my $verbosity = 2;
161	my $untidy = 0;
162
163	# parse the options
164	foreach $option (@options) {
165
166	print STDERR "option: $option\n";
167
168	if ($option =~ /^text=(.*)$/i) {
169	$indexes = $1;
170	} elsif ($option =~ /^title=(.*)$/i) {
171	$title = $1;
172	} elsif ($option =~ /^button=(.*)$/i) {
173	$button = $1;
174	} elsif ($option =~ /^builddir=(.*)$/i) {
175	$builddir = $1;
176	} elsif ($option =~ /^phinddir=(.*)$/i) {
177	$phinddir = $1;
178	} elsif ($option =~ /^savephrases=(.*)$/i) {
179	$savephrases = $1;
180	} elsif ($option =~ /^suffixsize=(.*)$/i) {
181	$suffixsize = $1;
182	} elsif ($option =~ /^suffixmode=(.*)$/i) {
183	$suffixmode = $1;
184	} elsif ($option =~ /^untidy/i) {
185	$untidy = 1;
186	} elsif ($option =~ /^verbosity=(.*)$/i) {
187	$verbosity = $1;
188	}
189	}
190
191	# classifier information
192	$self->{'collection'} = $collection;
193	$self->{'titlefield'} = $title;
194	$self->{'buttonname'} = $button;
195	$self->{'indexes'} = $indexes;
196
197	# phrase extraction options
198	$self->{'suffixmode'} = $suffixmode;
199	$self->{'suffixsize'} = $suffixsize;
200	$self->{'savephrases'} = $savephrases if ($savephrases);
201
202	# limit languages
203	$language =~ s/,/\\|/g;
204	$self->{'language_exp'} = $language;
205	$self->{'delimiter'} = $delimiter;
206
207	# build directory
208	if (!$builddir) {
209	$builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
210	}
211	$self->{'builddir'} = $builddir;
212
213	# phind directory
214	if (!$phinddir) {
215	$phinddir = &util::filename_cat($builddir, "phind");
216	}
217	$self->{'phinddir'} = $phinddir;
218
219	# debugging levels
220	$self->{'verbosity'} = $verbosity;
221	$self->{'untidy'} = $untidy;
222
223	return bless $self, $class;
224	}
225
226
227	# Initialise the phind classifier
228
229	sub init {
230	my $self = shift (@_);
231
232	# ensure we have a build directory
233	my $builddir = $self->{'builddir'};
234	die unless (-e "$builddir");
235
236	# create phind directory
237	my $phinddir = $self->{'phinddir'};
238	if (-e "$phinddir") {
239	&util::rm_r("$phinddir");
240	}
241	&util::mk_dir("$phinddir");
242
243	# open filehandles for documents and text
244	my $clausefile = &util::filename_cat("$phinddir", "clauses");
245	&util::rm($clausefile) if (-e $clausefile);
246	open(TEXT, ">$clausefile") \|\| die "Cannot open $clausefile: $!";
247	$self->{'txthandle'} = TEXT;
248
249	my $docfile = &util::filename_cat("$phinddir", "docs.txt");
250	&util::rm($docfile) if (-e $docfile);
251	open(DOCS, ">$docfile") \|\| die "Cannot open $docfile: $!";
252	$self->{'dochandle'} = DOCS;
253
254	}
255
256	# Classify each document.
257	#
258	# Each document is passed here in turn. The classifier extracts the
259	# text of each and stores it in the clauses file. Document details are
260	# stored in the docs.txt file.
261
262	sub classify {
263	my $self = shift (@_);
264	my ($doc_obj) = @_;
265
266	my $verbosity = $self->{'verbosity'};
267	my $top_section = $doc_obj->get_top_section();
268
269	my $titlefield = $self->{'titlefield'};
270
271	my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
272	print "process: $title\n" if ($verbosity > 2);
273
274
275	# only consider english-language files
276	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
277	my $phrlanguage = $self->{'language_exp'};
278	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
279
280	# record this file
281	my $total++;
282	print "file $total: $file\n" if ($self->{'$verbosity'});
283
284
285	# Store document details
286	my $OID = $doc_obj->get_OID();
287	$OID = "NULL" unless defined $OID;
288	my $dochandle = $self->{'dochandle'};
289	print $dochandle "<Document>\t$OID\t$title\n";
290
291	# Store the text occuring in this object
292
293	# output the document delimiter
294	my $txthandle = $self->{'txthandle'};
295	print $txthandle "$doclimit\n";
296
297	# iterarate over the required indexes and store their text
298	my $indexes = $self->{'indexes'};
299	my $text = "";
300	my ($part, $level, $field, $section, $data);
301
302	foreach $part (split(/,/, $indexes)) {
303
304	# Each field has a level and a data element ((e.g. document:Title)
305	($level, $field) = split(/:/, $part);
306	die unless ($level && $field);
307
308	# Extract the text from every section
309	# (In phind, document:text and section:text are equivalent)
310	if ($field eq "text") {
311	$data = "";
312	$section = $doc_obj->get_top_section();
313	while (defined($section)) {
314	$data .= $doc_obj->get_text($section) . "\n";
315	$section = $doc_obj->get_next_section($section);
316	}
317	$text .= convert_gml_to_tokens($data) . "\n";
318	}
319
320	# Extract a metadata field from a document
321	elsif ($level eq "document") {
322	$data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
323	$text .= convert_gml_to_tokens($data) . "\n";
324	}
325
326	# Extract metadata from every section in a document
327	elsif ($level eq "section") {
328	$data = "";
329	$section = $doc_obj->get_top_section();
330	while (defined($section)) {
331	$data .= $doc_obj->get_metadata_element($section, $field) . "\n";
332	$section = $doc_obj->get_next_section($section);
333	}
334	$text .= convert_gml_to_tokens($data) . "\n";
335	}
336
337	# Some sort of specification which I don't understand
338	else {
339	die "Unknown level ($level) in phind index ($part)\n";
340	}
341
342	}
343
344	# output the text
345	$text =~ tr/\n//s;
346	print $txthandle "$text";
347
348	}
349
350
351
352	# Construct the classifier from the information already gathered
353	#
354	# When get_classify_info is called, the clauses and docs.txt files have
355	# already been constructed in the phind directory. This function will
356	# translate them into compressed, indexed MGPP files that can be read by
357	# the phindcgi script. It will also register our classifier so that it
358	# shows up in the navigation bar.
359
360	sub get_classify_info {
361	my $self = shift (@_);
362
363	my $verbosity = $self->{'verbosity'};
364	my $phinddir = $self->{'phinddir'};
365	my $language = "english";
366
367	if ($verbosity) {
368	print STDERR "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
369	}
370
371	# Construct phind indexes
372	my $suffixmode = $self->{'suffixmode'};
373	my $suffixsize = $self->{'suffixsize'};
374	my ($command, $status);
375
376	# Generate the vocabulary, symbol statistics, and numbers file
377	# from the clauses file
378	print "\nExtracting vocabulary and statistics\n" if $verbosity;
379	&extract_vocabulary($phinddir, $language, $verbosity);
380
381	# Use the suffix program to generate the phind/phrases file
382	print "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
383	&execute("suffix $phinddir $suffixsize $suffixmode", $verbosity);
384
385	# Create the phrase file and put phrase numbers in phind/phrases
386	print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
387	&renumber_phrases($self);
388
389	# Create the mg phrase database
390	my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
391	my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
392	my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
393
394	my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
395	my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
396	my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
397	my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
398
399	print "\nCreating phrase databases\n";
400	my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
401	my $mg_stem = "pdata";
402
403	&execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity);
404	&execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity);
405	&execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity);
406
407	# create the mg index of words
408	print "\nCreating word-level search indexes\n";
409	$mg_input = &util::filename_cat($phinddir, "pword.txt");
410	$mg_stem = "pword";
411
412	&execute("$mg_passes -d $phinddir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
413	&execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity);
414	&execute("$mg_perf_hash_build -d $phinddir -f $mg_stem", $verbosity);
415	&execute("$mg_passes -d $phinddir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
416	&execute("$mg_weights_build -d $phinddir -f $mg_stem", $verbosity);
417	&execute("$mg_invf_dict -d $phinddir -f $mg_stem", $verbosity);
418
419	&execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 1", $verbosity);
420	&execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 2", $verbosity);
421	&execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 3", $verbosity);
422
423	# create the mg document information database
424	print "\nCreating document information databases\n";
425	$mg_input = &util::filename_cat($phinddir, "docs.txt");
426	$mg_stem = "docs";
427
428	&execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity);
429	&execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity);
430	&execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity);
431
432
433	# Tidy up stray files
434	if (!$self->{'untidy'}) {
435	print "\nCleaning up\n" if ($verbosity > 2);
436	&util::rm("$phinddir/clauses", "$phinddir/clauses.numbers",
437	"$phinddir/clauses.vocab", "$phinddir/clauses.stats",
438	"$phinddir/phrases", "$phinddir/docs.txt",
439	"$phinddir/pdata.txt", "$phinddir/pword.txt");
440	my $outfile = 1;
441	while (-e "$phinddir/outPhrase.$outfile") {
442	&util::rm("$phinddir/outPhrase.$outfile");
443	$outfile++;
444	}
445	}
446
447
448	# Insert the classifier into.... what?
449	my $collection = $self->{'collection'};
450	my $url = "library?a=p&p=phind&c=$collection";
451
452	my %classifyinfo = ('thistype'=>'Invisible',
453	'childtype'=>'Phind',
454	'Title'=>$self->{'buttonname'},
455	'contains'=>[]);
456
457	push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
458	return \%classifyinfo;
459	}
460
461
462
463	sub convert_gml_to_tokens {
464
465	$_ = shift @_;
466
467	# FIRST, remove GML tags
468
469	# Replace all whitespace with a simple space
470	s/\s+/ /gs;
471
472	# Remove everything that is in a tag
473	s/\s<p>\s/ PARAGRAPHBREAK /isg;
474	s/\s<br>\s/ LINEBREAK /isg;
475	s/<[^>]*>/ /sg;
476
477	# Now we have the text, but it may contain HTML
478	# elements coded as > etc. Remove these tags.
479	s/</</sg;
480	s/>/>/sg;
481
482	s/\s+/ /sg;
483	s/\s<p>\s/ PARAGRAPHBREAK /isg;
484	s/\s<br>\s/ LINEBREAK /isg;
485	s/<[^>]*>/ /sg;
486
487	# remove & and other miscellaneous markup tags
488	s/&/&/sg;
489	s/</</sg;
490	s/>/>/sg;
491	s/&/&/sg;
492
493	# replace<p> and <br> placeholders with carriage returns
494	s/PARAGRAPHBREAK/\n/sg;
495	s/LINEBREAK/\n/sg;
496
497
498	# Exceptional punctuation
499	#
500	# We make special cases of some punctuation
501
502	# remove any apostrophe that indicates omitted letters
503	s/(\w+)\'(\w*\s)/ $1$2 /g;
504
505	# remove period that appears in a person's initals
506	s/\s([A-Z])\./ $1 /g;
507
508	# replace hyphens in hypheanted words and names with a space
509	s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
510
511
512	# Convert the remaining text to "clause format",
513	# This means removing all excess punctuation and garbage text,
514	# normalising valid punctuation to fullstops and commas,
515	# then putting one cluse on each line.
516
517	# Insert newline when the end of a sentence is detected
518	# (delimter is: "[\.\?\!]\s")
519	s/\s*[\.\?\!]\s+/\n/g;
520
521	# split numbers after four digits
522	s/(\d\d\d\d)/$1 /g;
523
524	# split words after 32 characters
525
526	# squash repeated punctuation
527	tr/A-Za-z0-9 //cs;
528
529	# save email addresses
530	# s/\w+@\w+\.[\w\.]+/EMAIL/g;
531
532	# normalise clause breaks (mostly punctuation symbols) to commas
533	s/[^A-Za-z0-9 \n]+/ , /g;
534
535	# Remove repeated commas, and replace with newline
536	s/\s*,[, ]+/\n/g;
537
538	# remove extra whitespace
539	s/ +/ /sg;
540	s/^\s+//mg;
541	s/\s*$/\n/mg;
542
543	# remove lines that contain one word or less
544	s/^\w*$//mg;
545	s/^\s*$//mg;
546	tr/\n//s;
547
548	return $_;
549	}
550
551
552	# Execute a system command
553
554	sub execute {
555	my ($command, $verbosity) = @_;
556	print "Executing: $command\n" if ($verbosity > 2);
557	my $status = system($command);
558	if ($status != 0) {
559	print STDERR "phindgen.pl - Error executing $command: $!\n";
560	exit($status);
561	}
562	}
563
564
565	# Generate the vocabulary, symbol statistics, and numbers file from the
566	# clauses file. This is legacy code, so is a bit messy and probably wont
567	# run under windows.
568
569	sub extract_vocabulary {
570	my ($phind_dir, $language, $verbosity) = @_;
571
572	my ($w, $l, $line, $word);
573
574	my ($first_delimiter, $last_delimiter,
575	$first_stopword, $last_stopword,
576	$first_extractword, $last_extractword,
577	$first_contentword, $last_contentword,
578	$phrasedelimiter);
579
580	my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);
581
582
583	my %symbol;
584	my (%freq);
585
586	print "Calculating vocabulary\n" if ($verbosity > 1);
587
588	# Read and store the stopwords
589	my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" \| xargs cat`;
590	my %stopwords;
591	foreach my $w (split(/\s+/, $words)) {
592	$l = lc($w);
593	$stopwords{$l} = $w;
594	}
595
596	# Read and store the thesaurus terms
597	$use_thesaurus = 0;
598	my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
599	"$language", "agrovoc.lex");
600	if (-e "$lex_file") {
601	open(TH, "<$lex_file");
602	while(<TH>) {
603	s/^\d+ //;
604	s/\(.*\)//;
605	foreach my $w (split(/\s+/, $_)) {
606	$thesaurus{lc($w)} = $w;
607	}
608	}
609	close TH;
610	$use_thesaurus = 1;
611	}
612
613	# Read words in the text and count occurences
614	open(TXT, "<$phind_dir/clauses");
615	my @words;
616
617	while(<TXT>) {
618	$line = $_;
619	next unless ($line =~ /./);
620
621	@words = split(/\s+/, $line);
622	foreach $w (@words) {
623	$l = lc($w);
624	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
625	$freq{$w}++;
626	}
627	$freq{$senlimit}++;
628	}
629
630	# Calculate the "best" form of each word
631	my (%bestform, %totalfreq, %bestfreq);
632
633	foreach $w (sort (keys %freq)) {
634	$l = lc($w);
635
636	# totalfreq is the number of times a term appears in any form
637	$totalfreq{$l} += $freq{$w};
638
639	if (defined $stopwords{$l}) {
640	$bestform{$l} = $stopwords{$l};
641
642	} elsif (defined $thesaurus{$l}) {
643	$bestform{$l} = $thesaurus{$l};
644
645	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
646	$bestfreq{$l} = $freq{$w};
647	$bestform{$l} = $w;
648	}
649	}
650
651	undef %freq;
652	undef %bestfreq;
653
654
655	# Assign symbol numbers to tokens
656	my $nextsymbol = 1;
657	my (@vocab);
658
659	# Delimiters
660	$first_delimiter = 1;
661
662	foreach $word (@delimiters) {
663
664	$word = lc($word);
665	$bestform{$word} = uc($word);
666	$vocab[$nextsymbol] = $word;
667	$symbol{$word} = $nextsymbol;
668	$nextsymbol++;
669	}
670	$last_delimiter = $nextsymbol - 1;
671
672	# Stopwords
673	$first_stopword = $nextsymbol;
674
675	foreach my $word (sort keys %stopwords) {
676
677	# don't incluse stopword unless it occurs in the text
678	$word = lc($word);
679	next unless ($totalfreq{$word});
680	next if ($symbol{$word});
681
682	$vocab[$nextsymbol] = $word;
683	$symbol{$word} = $nextsymbol;
684	$nextsymbol++;
685	}
686	$last_stopword = $nextsymbol - 1;
687	$first_contentword = $nextsymbol;
688
689	# Thesaurus terms
690	if ($use_thesaurus) {
691	$first_thesaurusword = $nextsymbol;
692
693	foreach my $word (sort keys %thesaurus) {
694
695	$word = lc($word);
696	next if ($symbol{$word});
697	$bestform{$word} = $thesaurus{$word};
698
699	$vocab[$nextsymbol] = $word;
700	$symbol{$word} = $nextsymbol;
701	$nextsymbol++;
702
703	}
704	$last_thesaurusword = $nextsymbol - 1;
705	}
706
707	# Other content words
708	$first_extractword = $nextsymbol;
709
710	foreach my $word (sort (keys %bestform)) {
711
712	next if ($symbol{$word});
713
714	$vocab[$nextsymbol] = $word;
715	$symbol{$word} = $nextsymbol;
716	$nextsymbol++;
717	}
718	$last_extractword = $nextsymbol - 1;
719	$last_contentword = $nextsymbol - 1;
720
721
722	# Outut the words
723	print "Saving vocabulary in $phind_dir/clauses.vocab\n" if ($verbosity > 1);
724	open(VOC, ">$phind_dir/clauses.vocab");
725
726	for (my $i = 1; $i < $nextsymbol; $i++) {
727	$w = $vocab[$i];
728
729	print VOC "$bestform{$w}\n";
730	$totalfreq{$w} = 0 unless ($totalfreq{$w});
731	}
732	close VOC;
733
734
735	# Output statistics about the vocablary
736	print "Saving statistics in $phind_dir/clauses.stats\n" if ($verbosity > 1);
737	&util::rm("$phind_dir/clauses.stats") if (-e "$phind_dir/clauses.stats");
738	open(STAT, ">$phind_dir/clauses.stats")
739	\|\| die "Cannot open $phind_dir/clauses.stats: $!";
740
741	print STAT "first_delimiter $first_delimiter\n";
742	print STAT "last_delimiter $last_delimiter\n";
743	print STAT "first_stopword $first_stopword\n";
744	print STAT "last_stopword $last_stopword\n";
745	if ($use_thesaurus) {
746	print STAT "first_thesaurusword $first_thesaurusword\n";
747	print STAT "last_thesaurusword $last_thesaurusword\n";
748	}
749	print STAT "first_extractword $first_extractword\n";
750	print STAT "last_extractword $last_extractword\n";
751	print STAT "first_contentword $first_contentword\n";
752	print STAT "last_contentword $last_contentword\n";
753	print STAT "first_symbol $first_delimiter\n";
754	print STAT "last_symbol $last_contentword\n";
755	print STAT "first_word $first_stopword\n";
756	print STAT "last_word $last_contentword\n";
757	close STAT;
758
759	undef @vocab;
760
761
762	# Save text as symbol numbers
763	print "Saving text as numbers in $phind_dir/clauses.numbers\n" if ($verbosity > 1);
764
765	open(TXT, "<$phind_dir/clauses");
766	open(NUM, ">$phind_dir/clauses.numbers");
767
768	$phrasedelimiter = $symbol{lc($senlimit)};
769	print NUM "$symbol{lc($colstart)}\n";
770
771	# set up the special symbols that delimit documents and sentences
772	while(<TXT>) {
773
774	# split sentence into a list of tokens
775	$line = $_;
776	next unless ($line =~ /./);
777	@words = split(/\s+/, $line);
778
779	# output one token at a time
780	foreach $word (@words) {
781	$word = lc($word);
782	print NUM "$symbol{$word}\n";
783	}
784
785	# output phrase delimiter
786	print NUM "$phrasedelimiter\n";
787	}
788
789	print NUM "$symbol{lc($colend)}\n";
790
791	}
792
793
794	# Prepare the phrases file to be input to mgpp.
795	# This means renumbering the phrases in order of decreasing frequency.
796
797
798	# This is legacy code, and a little ugly, and may be unix-specific
799	# (particularly the sort command).
800
801	sub renumber_phrases {
802	my $self = shift (@_);
803
804	my $verbosity = $self->{'verbosity'};
805	my $phind_dir = $self->{'phinddir'};
806
807	my $savephrases = 0;
808	$savephrases = $self->{'savephrases'} if (defined($self->{'savephrases'}));
809
810
811
812	# Sort the phrases into order of increasing frequency
813	# This means the expansions will be sorted correctly later on.
814	print "Sorting phrases into freq order\n" if ($verbosity);
815	system("sort -rnt ':' +2 -o $phind_dir/phrases $phind_dir/phrases");
816
817	# Read the vocabulary
818	my @symbol;
819	print "Reading the vocabulary\n" if ($verbosity);
820	open(V, "<$phind_dir/clauses.vocab")
821	\|\| die "Cannot open $phind_dir/clauses.vocab: $!";
822
823	my $i = 1;
824	while(<V>) {
825	chomp;
826	$symbol[$i++] = $_;
827	}
828
829	# Create file for phrase data
830	#
831	# The phrases file looks something like this
832	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
833	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
834	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
835	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
836
837	# The first field on each line is a unique phrase identifier.
838	# We need to calculate phrase numbers for each phrase
839	print "Calculate phrase numbers\n" if ($verbosity);
840
841	my %phrasenumber;
842	my $nextphrase = 1;
843	my ($line);
844
845	open(IN, "<$phind_dir/phrases");
846	while(<IN>) {
847
848	# read the line
849	chomp;
850	$line = $_;
851
852	# we're only interested in the first field
853	$line =~ s/:.*//;
854
855	# get a phrase number for this line
856	$phrasenumber{$line} = $nextphrase;
857	$nextphrase++;
858	}
859
860
861	# Now we create a new phrase file using phrase numbers, not the old IDs.
862	print "Format phrase data for MGPP\n" if ($verbosity);
863
864	# Open the basic files
865	open(IN, "<$phind_dir/phrases");
866	open(DATA, ">$phind_dir/pdata.txt");
867	open(IDX, ">$phind_dir/pword.txt");
868
869	# We may want to save the phrases in a separate text file
870	if ($savephrases) {
871	print "Saving phrases in $savephrases\n" if ($verbosity);
872	open(SAVE, ">$savephrases");
873	}
874
875	my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
876	my @fields;
877	my @documents;
878	my (@newexp, $k, $n);
879
880	my $linenumber = 0;
881
882	while(<IN>) {
883
884	# read the line
885	chomp;
886	$line = $_;
887	@fields = split(/:/, $line);
888
889	# get a phrase number for this line
890	$key = shift @fields;
891	die unless (defined($phrasenumber{$key}));
892	$num = $phrasenumber{$key};
893
894	# get the text of the phrase
895	$text = shift @fields;
896	$text =~ s/s(\d+)/$symbol[$1]/g;
897	if ($text =~ / /) {
898	$word = "";
899	} else {
900	$word = $text;
901	}
902
903	$linenumber++;
904	if ($linenumber % 1000 == 0) {
905	print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 2);
906	}
907	print "$num: $key\t($text)\n" if ($verbosity > 3);
908
909	# get the phrase frequency
910	$tf = shift @fields;
911
912	# get the number of expansions
913	$countexp = shift @fields;
914
915	# get the expansions and convert them into phrase numbers
916	$expansions = shift @fields;
917	@newexp = ();
918	foreach $k (split(/,/, $expansions)) {
919	die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
920	$n = $phrasenumber{$k};
921	push @newexp, $n;
922	}
923	@newexp = sort numerically @newexp;
924
925	# get the number of documents
926	$countdocs = shift @fields;
927
928	# get the documents
929	$documents = shift @fields;
930	$documents =~ s/d//g;
931	@documents = split(/;/, $documents);
932	@documents = sort by_frequency @documents;
933
934	# output the phrase data
935	print DATA "<Document>";
936	print DATA "$num:$text:$tf:$countexp:$countdocs:";
937	print DATA join(",", @newexp), ":", join(";", @documents), "\n";
938
939	# output the word index search data
940	print IDX "<Document>$word\n";
941
942	# output the phrases to a text file
943	if ($savephrases) {
944	print SAVE "$tf\t$countdocs\t$text\n";
945	}
946
947	}
948
949	close SAVE if ($savephrases);
950	}
951
952	# sort routines used to renumber phrases
953
954	sub numerically { $a <=> $b }
955
956	sub by_frequency {
957	my $fa = 1;
958	if ($a =~ /,/) {
959	$fa = $a;
960	$fa =~ s/\d+,//;
961	}
962	my $fb = 1;
963	if ($b =~ /,/) {
964	$fb = $b;
965	$fb =~ s/\d+,//;
966	}
967
968	return ($fb <=> $fa);
969	}
970
971
972	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: