Context Navigation

source: trunk/gsdl/perllib/classify/phind.pm@ 2811

Last change on this file since 2811 was 2803, checked in by sjboddie, 23 years ago
* empty log message *
Property svn:keywords set to `Author Date Id Revision`
File size: 38.5 KB

Line
1	###########################################################################
2	#
3	# phind.pm -- the Phind classifier
4	#
5	# Copyright (C) 2000 Gordon W. Paynter
6	# Copyright (C) 2000 New Zealand Digital Library Project
7	#
8	#
9	# A component of the Greenstone digital library software
10	# from the New Zealand Digital Library Project at the
11	# University of Waikato, New Zealand.
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# The phind classifier plugin.
30	# Options are described in the print_usage function.
31	# Type "classinfo.pl phind" at the command line for a summary.
32
33	package phind;
34
35	use BasClas;
36	use util;
37	use ghtml;
38	use unicode;
39
# Phind working directories that the END block should tidy up.
my @removedirs = ();

# File suffixes that the END block must leave in place when it tidies
# a phind directory; files with any other suffix are deleted there.
my %wanted_index_files = map { $_ => 1 }
    qw(td t ti tl tsd idb ib1 ib2 ib3 i il w wa);
55
# Compile-time hook: declare BasClas as this package's base class.
sub BEGIN {
    our @ISA = qw(BasClas);
}
59
# Program-exit hook: tidy up stray files.
#
# This is done here rather than in get_classify_info() because of a
# problem (on windows at least) where the close() appears to fail on
# txthandle and dochandle, which prevents those files from being
# deleted at that point.
#
# For every directory queued in @removedirs, each entry whose suffix is
# not listed in %wanted_index_files (or that has no suffix at all) is
# removed with util::rm.
sub END {
    foreach my $dir (@removedirs) {
        next unless -d $dir;
        opendir(my $dirhandle, $dir) or next;
        my @entries = readdir $dirhandle;
        closedir $dirhandle;

        foreach my $entry (@entries) {
            # skip the "." and ".." entries
            next if $entry =~ /^\.\.?$/;

            # keep the index files we still want
            my ($suffix) = $entry =~ /\.([^\.]+)$/;
            next if defined $suffix && defined $wanted_index_files{$suffix};

            # delete it!
            util::rm(util::filename_cat($dir, $entry));
        }
    }
}
83
# Print a summary of the classifier's options to STDERR.
# Takes no arguments; the whole text is one string literal.
# NOTE(review): new() also parses a -builddir option that is not
# listed in this text.
sub print_usage {
print STDERR "
usage: classify phind [options]

options:
-text Fields The text used to build the phrase hierarchy.
(default: 'section:Title,section:text')

-title Title The metadata field used to describe each document.
(default: 'Title')

-button Name The label for the classifier screen and button in
navigation bar.
(default: 'Phrase')

-language Regex Language or languages to use building hierarchy.
Languages are identified by two-letter country codes
like en (English), es (Spanish), and fr (French).
Language is a regular expression, so 'en\|fr' (English or
French) and '..' (match any language) are valid.
(default: 'en'.)

-savephrases File If set, the phrase infomation will be stored in
the given file as text. It is probably a good idea
to use an absolute path.
(default: not set)

-suffixmode N The smode parameter to the phrase extraction program. A
value of 0 means that stopwords are ignored, and of 1
means that stopwords are used.
(default: 1)

-thesaurus Name Name of a thesaurus stored in phind format in the
collection's etc directory.
(default: not set)

-untidy Don't remove working files.

"; }
123
# Phrase delimiter symbols - these should be abstracted out someplace.
# They mark the start and end of the collection and the document and
# sentence limits in the clauses data.

my ($colstart, $colend, $doclimit, $senlimit) =
    ("COLLECTIONSTART", "COLLECTIONEND", "DOCUMENTLIMIT", "SENTENCELIMIT");
my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
131
132
# Create a new phind browser based on collect.cfg
#
# Arguments: the class name followed by the classifier options (see
# print_usage; -builddir is also accepted).  Unrecognised options are
# tolerated ("allow_extra_options").
#
# Returns the blessed classifier object.  Exits with status 1 if the
# phind "suffix" program is not installed, and dies if the options
# cannot be parsed.

sub new {
    my $class = shift (@_);
    my $self = BasClas->new($class, @_);

    # Ensure the Phind "suffix" program is in place
    my $suffix_program = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
    $suffix_program .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;

    if (!(-e $suffix_program)) {
        print STDERR "phind.pm: ERROR: The phind \"suffix\" program is not installed.\n\n";
        exit(1);
    }

    # Parse classifier arguments.  $language is kept lexical so that
    # the parsed value does not leak into a package-wide variable.
    my $builddir = "";
    my $language = "";
    if (!parsargv::parse(\@_,
                         q^text/.*/section:Title,section:text^, \$self->{'indexes'},
                         q^title/.*/Title^, \$self->{'titlefield'},
                         q^button/.*/Phrase^, \$self->{'buttonname'},
                         q^language/.*/en^, \$language,
                         q^builddir/.*/^, \$builddir,
                         q^savephrases/.*/^, \$self->{'savephrases'},
                         q^suffixmode/\d/1^, \$self->{'suffixmode'},
                         q^thesaurus/.*/^, \$self->{'thesaurus'},
                         q^untidy^, \$self->{'untidy'},
                         "allow_extra_options")) {

        print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
        &print_usage();
        die "\n";
    }

    # classifier information
    $self->{'collection'} = $ENV{'GSDLCOLLECTION'};

    # limit languages
    $self->{'language_exp'} = $language;

    # collection directories; the build directory defaults to
    # "building" inside the collection directory
    $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
    if (!$builddir) {
        $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
    }
    $self->{'builddir'} = $builddir;

    # number of documents classified so far
    $self->{'total'} = 0;

    return bless $self, $class;
}
187
188
# Initialise the phind classifier
#
# Creates the first unused "phindN" directory inside the build
# directory, records it (and N) on the object, queues it for END-time
# tidying unless -untidy was given, and opens the "clauses" and
# "docs.txt" files for writing.  The handles are stored on the object
# by name ('TEXT<N>' and 'DOC<N>').
#
# Dies if the build directory does not exist or if either file cannot
# be opened.

sub init {
    my $self = shift (@_);

    # ensure we have a build directory
    my $builddir = $self->{'builddir'};
    die "phind.pm: build directory \"$builddir\" does not exist\n"
        unless (-e "$builddir");

    # create phind directory
    my $phnumber = 1;
    my $phinddir = &util::filename_cat($builddir, "phind1");
    while (-e "$phinddir") {
        $phnumber++;
        $phinddir = &util::filename_cat($builddir, "phind$phnumber");
    }
    &util::mk_dir("$phinddir");
    $self->{'phinddir'} = $phinddir;
    $self->{'phindnumber'} = $phnumber;

    push(@removedirs, $phinddir) unless $self->{'untidy'};

    # open filehandles for documents and text; three-argument open
    # keeps the mode separate from the file name
    my $clausefile = &util::filename_cat("$phinddir", "clauses");
    &util::rm($clausefile) if (-e $clausefile);

    my $txthandle = 'TEXT' . $phnumber;
    open($txthandle, '>', $clausefile) || die "Cannot open $clausefile: $!";
    $self->{'txthandle'} = $txthandle;

    my $docfile = &util::filename_cat("$phinddir", "docs.txt");
    &util::rm($docfile) if (-e $docfile);

    my $dochandle = 'DOC' . $phnumber;
    open($dochandle, '>', $docfile) || die "Cannot open $docfile: $!";
    $self->{'dochandle'} = $dochandle;

}
227
228
# Classify each document.
#
# Each document is passed here in turn. The classifier extracts the
# text of each and stores it in the clauses file. Document details are
# stored in the docs.txt file.
#
# Arguments: the classifier object and the document object.
# Documents whose "Language" metadata does not match the classifier's
# language expression are skipped.  Dies on an index specification
# that is not of the form level:field or that names an unknown level.

sub classify {
    my $self = shift (@_);
    my $doc_obj = shift @_;

    my $verbosity = $self->{'verbosity'};
    my $top_section = $doc_obj->get_top_section();

    my $titlefield = $self->{'titlefield'};

    # A document without the title field is still recorded, with an
    # empty title, instead of interpolating an undefined value.
    my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
    $title = "" unless defined $title;
    print "process: $title\n" if ($verbosity > 2);

    # Only consider the file if it is in the correct language
    my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
    my $phrlanguage = $self->{'language_exp'};
    return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));

    # record this file
    # (A diagnostic that used to follow this line could never fire: it
    # tested the literal hash key '$verbosity' and printed a variable
    # that is never set in this function, so it has been removed.)
    $self->{'total'} ++;

    # Store document details
    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;
    my $dochandle = $self->{'dochandle'};
    print $dochandle "<Document>\t$OID\t$title\n";

    # Store the text occurring in this object

    # output the document delimiter
    my $txthandle = $self->{'txthandle'};
    print $txthandle "$doclimit\n";

    # iterate over the required indexes and store their text
    my $indexes = $self->{'indexes'};
    my $text = "";
    my ($part, $level, $field, $section, $data, $dataref);

    foreach $part (split(/,/, $indexes)) {

        # Each field has a level and a data element (e.g. document:Title)
        ($level, $field) = split(/:/, $part);
        die "Invalid phind index ($part): expected level:field\n"
            unless ($level && $field);

        # Extract the text from every section
        # (In phind, document:text and section:text are equivalent)
        if ($field eq "text") {
            $data = "";
            $section = $doc_obj->get_top_section();
            while (defined($section)) {
                $data .= $doc_obj->get_text($section) . "\n";
                $section = $doc_obj->get_next_section($section);
            }
            $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
        }

        # Extract a metadata field from a document
        # (If there is more than one element of the given type, get them all.)
        elsif ($level eq "document") {
            $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
            foreach $data (@$dataref) {
                $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
            }
        }

        # Extract metadata from every section in a document
        elsif ($level eq "section") {
            $data = "";
            $section = $doc_obj->get_top_section();
            while (defined($section)) {
                $dataref = $doc_obj->get_metadata($section, $field);
                $data .= join("\n", @$dataref) . "\n";
                $section = $doc_obj->get_next_section($section);
            }
            $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
        }

        # Some sort of specification which I don't understand
        else {
            die "Unknown level ($level) in phind index ($part)\n";
        }

    }

    # output the text, with runs of newlines squeezed to one
    $text =~ tr/\n//s;
    print $txthandle "$text";
}
324
325
# Construct the classifier from the information already gathered
#
# When get_classify_info is called, the clauses and docs.txt files have
# already been constructed in the phind directory. This function will
# translate them into compressed, indexed MGPP files that can be read by
# the phindcgi script. It will also register our classifier so that it
# shows up in the navigation bar.
#
# Returns a reference to the hash describing the classifier.

sub get_classify_info {
    my ($self) = @_;

    close $self->{'dochandle'};
    close $self->{'txthandle'};

    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};
    my $phinddir = $self->{'phinddir'};

    # extra argument passed to every mgpp program except on windows
    my $osextra = ($ENV{'GSDLOS'} !~ /^windows$/i) ? " -d /" : "";

    if ($verbosity) {
        print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
        print $out "*** in ", $phinddir, "\n";
    }

    # Run one mgpp program on the given file stem; $arguments is
    # appended verbatim after the quoted stem.
    my $mgpp = sub {
        my ($program, $stem, $arguments) = @_;
        &execute("$program $osextra -f \"$stem\"$arguments", $verbosity, $out);
    };

    # Run the two mgpp_passes (with the compression dictionary built in
    # between) over "<name>.txt", producing files with the stem "<name>".
    my $build_text_database = sub {
        my ($name) = @_;
        my $input = &util::filename_cat($phinddir, "$name.txt");
        my $stem = &util::filename_cat($phinddir, $name);
        $mgpp->("mgpp_passes", $stem, " -T1 \"$input\"");
        $mgpp->("mgpp_compression_dict", $stem, "");
        $mgpp->("mgpp_passes", $stem, " -T2 \"$input\"");
    };

    # Construct phind indexes
    my $suffixmode = $self->{'suffixmode'};

    # Generate the vocabulary, symbol statistics, and numbers file
    # from the clauses file
    print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
    &extract_vocabulary($self);

    # Use the suffix program to generate the phind/phrases file
    print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
    &execute("suffix \"$phinddir\" $suffixmode $verbosity", $verbosity, $out);

    # Create the phrase file and put phrase numbers in phind/phrases
    print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
    &renumber_phrases($self);

    print $out "\nCreating phrase databases\n";
    $build_text_database->("pdata");

    # create the mg index of words
    print $out "\nCreating word-level search indexes\n";
    my $word_input = &util::filename_cat($phinddir, "pword.txt");
    my $word_stem = &util::filename_cat($phinddir, "pword");

    $mgpp->("mgpp_passes", $word_stem, " -T1 -I1 \"$word_input\"");
    $mgpp->("mgpp_compression_dict", $word_stem, "");
    $mgpp->("mgpp_perf_hash_build", $word_stem, "");
    $mgpp->("mgpp_passes", $word_stem, " -T2 -I2 \"$word_input\"");
    $mgpp->("mgpp_weights_build", $word_stem, "");
    $mgpp->("mgpp_invf_dict", $word_stem, "");

    foreach my $stem_option (1 .. 3) {
        $mgpp->("mgpp_stem_idx", $word_stem, " -s $stem_option");
    }

    # create the mg document information database
    print $out "\nCreating document information databases\n";
    $build_text_database->("docs");

    # Return the information about the classifier that we'll later want to
    # use to create macros when the Phind classifier document is displayed.
    my $collection = $self->{'collection'};
    my %classifyinfo = (
        'thistype'   => 'Invisible',
        'childtype'  => 'Phind',
        'Title'      => $self->{'buttonname'},
        'parameters' => "phindnumber=$self->{'phindnumber'}",
        'contains'   => [ { 'OID' => "library?a=p&p=phind&c=$collection" } ],
    );

    return \%classifyinfo;
}
417
418
419
# Convert GML text into "clause format": one clause per line, tokens
# separated by single spaces, lines of one word or less removed.
#
# Arguments: the classifier's language expression and the text.
# If the language expression matches /en/ the work is handed to
# convert_gml_to_tokens_EN; otherwise the generic conversion below is
# applied.  Returns the converted text.

sub convert_gml_to_tokens {

    my ($language_exp, $text) = @_;

    # escape any magic words... - jrm21
    # (a delimiter symbol occurring in the text itself is lower-cased)
    foreach my $delim (@delimiters) {
        my $replacement = lc($delim);
        $text =~ s/$delim/$replacement/g;
    }

    if ($language_exp =~ /en/) {
        return &convert_gml_to_tokens_EN($text);
    }

    # work on a localised $_ so the caller's $_ is left alone
    local $_ = $text;

    # 1. remove GML tags

    # Remove everything that is in a tag
    s/\s<p>\s/ PARAGRAPHBREAK /isgo;
    s/\s<br>\s/ LINEBREAK /isgo;
    s/<[^>]*>/ /sgo;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc. Decode them, then remove the tags
    # that the decoding has exposed.
    s/&amp;/&/sgo;
    s/&lt;/</sgo;
    s/&gt;/>/sgo;
    s/\s<p>\s/ PARAGRAPHBREAK /isgo;
    s/\s<br>\s/ LINEBREAK /isgo;
    s/<[^>]*>/ /sgo;

    # replace <p> and <br> placeholders with clause break symbol (\n)
    s/\s+/ /gso;
    s/PARAGRAPHBREAK/\n/sgo;
    s/LINEBREAK/\n/sgo;

    # 2. Split the remaining text into space-delimited tokens

    # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
    s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;

    # Split text at word boundaries
    s/\b/ /go;

    # 3. Convert the remaining text to "clause format"

    # Insert newline if the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    # s/\s*[\.\?\!]\s+/\n/go;

    # remove unnecessary punctuation and replace with clause break symbol (\n)
    s/[^\w ]/\n/go;

    # remove extraneous whitespace
    s/ +/ /sgo;
    s/^\s+//mgo;
    s/\s*$/\n/mgo;

    # remove lines that contain one word or less
    s/^\S*$//mgo;
    s/^\s*$//mgo;
    tr/\n//s;

    return $_;
}
490
# A version of convert_gml_to_tokens that is fine-tuned to the English language.
#
# Takes the text as its only argument and returns it in "clause
# format": one clause per line, with lines of one word or less removed.

sub convert_gml_to_tokens_EN {
    # work on a localised $_ so the caller's $_ is left alone
    local $_ = shift @_;

    # FIRST, remove GML tags

    # Replace all whitespace with a simple space
    s/\s+/ /gs;

    # Remove everything that is in a tag
    s/\s<p>\s/ PARAGRAPHBREAK /isg;
    s/\s<br>\s/ LINEBREAK /isg;
    s/<[^>]*>/ /sg;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc. Decode them, then remove the tags
    # that the decoding has exposed.
    s/&lt;/</sg;
    s/&gt;/>/sg;

    s/\s+/ /sg;
    s/\s<p>\s/ PARAGRAPHBREAK /isg;
    s/\s<br>\s/ LINEBREAK /isg;
    s/<[^>]*>/ /sg;

    # remove &amp; and other miscellaneous markup tags
    s/&amp;/&/sg;
    s/&lt;/</sg;
    s/&gt;/>/sg;
    s/&amp;/&/sg;

    # replace <p> and <br> placeholders with carriage returns
    s/PARAGRAPHBREAK/\n/sg;
    s/LINEBREAK/\n/sg;


    # Exceptional punctuation
    #
    # We make special cases of some punctuation

    # remove any apostrophe that indicates omitted letters
    s/(\w+)\'(\w*\s)/ $1$2 /g;

    # remove period that appears in a person's initials
    s/\s([A-Z])\./ $1 /g;

    # replace hyphens in hyphenated words and names with a space
    s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;

    # Convert the remaining text to "clause format",
    # This means removing all excess punctuation and garbage text,
    # normalising valid punctuation to fullstops and commas,
    # then putting one clause on each line.

    # Insert newline when the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    s/\s*[\.\?\!]\s+/\n/g;

    # split numbers after four digits
    s/(\d\d\d\d)/$1 /g;

    # split words after 32 characters

    # squash repeated punctuation
    tr/A-Za-z0-9 //cs;

    # save email addresses
    # s/\w+@\w+\.[\w\.]+/EMAIL/g;

    # normalise clause breaks (mostly punctuation symbols) to commas
    s/[^A-Za-z0-9 \n]+/ , /g;

    # Remove repeated commas, and replace with newline
    s/\s*,[, ]+/\n/g;

    # remove extra whitespace
    s/ +/ /sg;
    s/^\s+//mg;
    s/\s*$/\n/mg;

    # remove lines that contain one word or less
    s/^\w*$//mg;
    s/^\s*$//mg;
    tr/\n//s;

    return $_;

}
579
580
581
# Execute a system command
#
# Arguments: the command line, the verbosity level, and the handle that
# the "Executing:" trace is written to when verbosity is above 2.
#
# On failure an error is printed to STDERR and the program exits with
# the command's own exit code, or with 1 if the command could not be
# started or ended without a non-zero exit code (e.g. killed by a
# signal).

sub execute {
    my ($command, $verbosity, $outhandle) = @_;
    print $outhandle "Executing: $command\n" if ($verbosity > 2);
    $! = 0;
    my $status = system($command);
    if ($status != 0) {
        # system() returns -1 when the command could not be run at all
        # (only then is $! meaningful); otherwise it returns the wait
        # status, whose high byte holds the command's exit code.
        my $reason = ($status == -1) ? "$!" : "wait status $status";
        print STDERR "phind - Error executing '$command': $reason\n";

        # Passing the raw wait status to exit() would be truncated to
        # its low byte, turning an exit code of 1 (status 256) into 0.
        my $exitcode = ($status == -1) ? 1 : ($status >> 8);
        exit($exitcode || 1);
    }
}
594
595
# Generate the vocabulary, symbol statistics, and numbers file from the
# clauses file. This is legacy code, so is a bit messy and probably won't
# run under windows.
#
# Reads   <phinddir>/clauses, the stopword files for the classifier's
#         language and (if a thesaurus is set) the thesaurus files in
#         the collection's etc directory.
# Writes  <phinddir>/clauses.vocab, clauses.stats and clauses.numbers,
#         plus <phinddir>/<thesaurus>.numbers when a thesaurus is set.
#
# Symbol numbers are assigned in this order: delimiters, stopwords that
# occur in the text, thesaurus terms, then all other words.
#
# Dies if a required file cannot be opened.  An unreadable stopword
# directory or file only produces a warning, because the vocabulary can
# still be built without it.

sub extract_vocabulary {
    my ($self) = @_;

    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};

    my $collectiondir = $self->{'collectiondir'};
    my $phinddir = $self->{'phinddir'};

    my $language_exp = $self->{'language_exp'};

    my ($w, $l, $line, $word);

    my ($first_delimiter, $last_delimiter,
        $first_stopword, $last_stopword,
        $first_extractword, $last_extractword,
        $first_contentword, $last_contentword,
        $phrasedelimiter);

    my $thesaurus = $self->{'thesaurus'};
    my ($thesaurus_links, $thesaurus_terms,
        %thesaurus, $first_thesaurusword, $last_thesaurusword);

    my %symbol;
    my (%freq);

    print $out "Calculating vocabulary\n" if ($verbosity > 1);

    # Read and store the stopwords
    my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
    my ($language, $language_dir, $file, $file_name);
    my %stopwords;

    # Examine each directory in the stopword directory
    my @language_entries = ();
    if (opendir(my $stopdirhandle, $stopdir)) {
        @language_entries = readdir $stopdirhandle;
        closedir $stopdirhandle;
    } else {
        print STDERR "phind.pm: WARNING: cannot read stopword directory $stopdir: $!\n";
    }

    foreach $language (@language_entries) {

        # Ignore entries that do not match the classifier's language
        next unless ($language =~ /$language_exp/);
        $language_dir = &util::filename_cat($stopdir, $language);
        next unless (-d "$language_dir");

        my $langdirhandle;
        unless (opendir($langdirhandle, $language_dir)) {
            print STDERR "phind.pm: WARNING: cannot read stopword directory $language_dir: $!\n";
            next;
        }
        my @stopword_entries = readdir $langdirhandle;
        closedir $langdirhandle;

        foreach $file (@stopword_entries) {

            # Ignore entries that are not stopword files
            next unless ($file =~ /sw$/);
            $file_name = &util::filename_cat($language_dir, $file);
            next unless (-f "$file_name");

            # Read the stopwords
            my $stopfile;
            unless (open($stopfile, '<', $file_name)) {
                print STDERR "phind.pm: WARNING: cannot read stopword file $file_name: $!\n";
                next;
            }
            while (defined($word = <$stopfile>)) {
                # keep only the first token on the line; /s lets the
                # match run through the trailing newline, so a line
                # with trailing text does not leave "\n" on the word
                $word =~ s/^\s+//;
                $word =~ s/\s.*//s;
                $l = lc($word);
                $stopwords{$l} = $word;
            }
            close $stopfile;

        }
    }

    # Read thesaurus information
    if ($thesaurus) {

        # link file exists
        $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
        die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");

        # ensure term file exists in the correct language
        if ($language_exp =~ /^([a-z][a-z])/) {
            $language = $1;
        } else {
            $language = 'en';
        }
        $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
        die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");


        # Read the thesaurus terms
        open(my $termhandle, '<', $thesaurus_terms)
            || die "Cannot open $thesaurus_terms: $!";
        while (defined($line = <$termhandle>)) {
            $line =~ s/^\d+ //;
            $line =~ s/\(.*\)//;
            foreach $w (split(/\s+/, $line)) {
                $thesaurus{lc($w)} = $w;
            }
        }
        close $termhandle;
    }

    # Read words in the text and count occurrences
    my $clausefile = "$phinddir/clauses";
    open(my $txthandle, '<', $clausefile)
        || die "Cannot open $clausefile: $!";

    my @words;
    while (defined($line = <$txthandle>)) {
        next unless ($line =~ /./);

        @words = split(/\s+/, $line);
        foreach $w (@words) {
            $l = lc($w);
            # stopwords and thesaurus terms are counted in lower case
            $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
            $freq{$w}++;
        }
        $freq{$senlimit}++;
    }

    close $txthandle;

    # Calculate the "best" form of each word
    my (%bestform, %totalfreq, %bestfreq);

    foreach $w (sort (keys %freq)) {
        $l = lc($w);

        # totalfreq is the number of times a term appears in any form
        $totalfreq{$l} += $freq{$w};

        if (defined $stopwords{$l}) {
            $bestform{$l} = $stopwords{$l};

        } elsif (defined $thesaurus{$l}) {
            $bestform{$l} = $thesaurus{$l};

        } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
            $bestfreq{$l} = $freq{$w};
            $bestform{$l} = $w;
        }
    }
    undef %freq;
    undef %bestfreq;


    # Assign symbol numbers to tokens
    my $nextsymbol = 1;
    my (@vocab);

    # Delimiters
    $first_delimiter = 1;

    foreach $word (@delimiters) {

        # $word = lc($word); # jrm21
        $word = uc($word);
        $bestform{$word} = $word;
        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_delimiter = $nextsymbol - 1;
    # Stopwords
    $first_stopword = $nextsymbol;

    foreach my $word (sort keys %stopwords) {
        # don't include stopword unless it occurs in the text
        $word = lc($word);
        next unless ($totalfreq{$word});
        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_stopword = $nextsymbol - 1;
    $first_contentword = $nextsymbol;

    # Thesaurus terms
    if ($thesaurus) {
        $first_thesaurusword = $nextsymbol;

        foreach my $word (sort keys %thesaurus) {

            $word = lc($word);
            next if ($symbol{$word});
            $bestform{$word} = $thesaurus{$word};

            $vocab[$nextsymbol] = $word;
            $symbol{$word} = $nextsymbol;
            $nextsymbol++;

        }
        $last_thesaurusword = $nextsymbol - 1;
    }

    # Other content words
    $first_extractword = $nextsymbol;

    foreach my $word (sort (keys %bestform)) {

        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_extractword = $nextsymbol - 1;
    $last_contentword = $nextsymbol - 1;

    # Output the words
    print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
    open(my $vochandle, '>', "$phinddir/clauses.vocab")
        || die "Cannot open $phinddir/clauses.vocab: $!";

    for (my $i = 1; $i < $nextsymbol; $i++) {
        $w = $vocab[$i];

        print $vochandle "$bestform{$w}\n";
        $totalfreq{$w} = 0 unless ($totalfreq{$w});
    }
    close $vochandle;


    # Create statistics file
    # Output statistics about the vocabulary
    print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
    &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");

    open(my $stathandle, '>', "$phinddir/clauses.stats")
        || die "Cannot open $phinddir/clauses.stats: $!";

    print $stathandle "first_delimiter $first_delimiter\n";
    print $stathandle "last_delimiter $last_delimiter\n";
    print $stathandle "first_stopword $first_stopword\n";
    print $stathandle "last_stopword $last_stopword\n";
    if ($thesaurus) {
        print $stathandle "first_thesaurusword $first_thesaurusword\n";
        print $stathandle "last_thesaurusword $last_thesaurusword\n";
    }
    print $stathandle "first_extractword $first_extractword\n";
    print $stathandle "last_extractword $last_extractword\n";
    print $stathandle "first_contentword $first_contentword\n";
    print $stathandle "last_contentword $last_contentword\n";
    print $stathandle "first_symbol $first_delimiter\n";
    print $stathandle "last_symbol $last_contentword\n";
    print $stathandle "first_word $first_stopword\n";
    print $stathandle "last_word $last_contentword\n";
    close $stathandle;

    undef @vocab;


    # Create numbers file
    # Save text as symbol numbers
    print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);

    open($txthandle, '<', $clausefile)
        || die "Cannot open $clausefile: $!";
    open(my $numhandle, '>', "$phinddir/clauses.numbers")
        || die "Cannot open $phinddir/clauses.numbers: $!";

    ## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
    ## print NUM "$symbol{lc($colstart)}\n"; # jrm21
    $phrasedelimiter = $symbol{$senlimit};
    print $numhandle "$symbol{$colstart}\n";

    # set up the special symbols that delimit documents and sentences
    while (defined($line = <$txthandle>)) {

        # split sentence into a list of tokens
        next unless ($line =~ /./);
        @words = split(/\s+/, $line);

        # output one token at a time
        foreach $word (@words) {
            # don't lower-case special delimiters - jrm21
            unless (grep { $word eq $_ } @delimiters) {
                $word = lc($word);
            }
            print $numhandle "$symbol{$word}\n";
        }

        # output phrase delimiter
        print $numhandle "$phrasedelimiter\n";
    }

    close $txthandle;
    # print NUM "$symbol{lc($colend)}\n";# jrm21
    print $numhandle "$symbol{$colend}\n";
    close $numhandle;

    # Save thesaurus data in one convenient file
    if ($thesaurus) {

        my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");


        print $out "Saving thesaurus as numbers in $thesaurusfile\n"
            if ($verbosity > 1);

        # Read the thesaurus terms
        my ($num, $text, %thes_symbols);

        open(my $termhandle, '<', $thesaurus_terms)
            || die "Cannot open $thesaurus_terms: $!";
        while (defined($line = <$termhandle>)) {
            chomp $line;
            @words = split(/\s+/, $line);
            $num = shift @words;
            $text = "";

            # translate words into symbol numbers
            foreach $word (@words) {
                $word = lc($word);
                if ($symbol{$word}) {
                    $text .= "s$symbol{$word} ";
                } elsif ($verbosity) {
                    print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
                }
            }
            $text =~ s/ $//;
            $thes_symbols{$num} = $text;
        }
        close $termhandle;

        # Read the thesaurus links and write the corresponding data
        open(my $linkhandle, '<', $thesaurus_links)
            || die "Cannot open $thesaurus_links: $!";
        open(my $thesout, '>', $thesaurusfile)
            || die "Cannot open $thesaurusfile: $!";

        while (defined($line = <$linkhandle>)) {
            chomp $line;
            ($num, $text) = split(/:/, $line);

            if (defined($thes_symbols{$num})) {
                print $thesout "$num:$thes_symbols{$num}:$text\n";
            } else {
                print $thesout "$num:untranslated:$text\n";
            }
        }
        close $linkhandle;
        close $thesout;
    }

}
939
940
941	# renumber_phrases
942	#
943	# Prepare the phrases file to be input to mgpp. The biggest problem is
944	# reconciling the phrase identifiers used by the suffix program (which
945	# we'll call suffix-id numbers) with the numbers used in the thesaurus
946	# (thesaurus-id) to create a common set of phind id numbers (phind-id).
947	# Phind-id numbers must be sorted by frequency of occurrence.
948	#
949	# Start creating a set of phind-id numbers from the sorted suffix-id
950	# numbers and (if required) the thesaurus-id numbers. Then add any other
951	# phrases occurring in the thesaurus.
952	#
953	# The last thing we have to do is restore the vocabulary information to the
954	# phrase file so that the phrases are stored as words, not as symbol
955	# numbers.
956
957	# The original phrases file looks something like this:
958	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
959	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
960	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
961	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
962
963
# Run the three renumbering stages in order on the classifier object:
# suffix data, then thesaurus data, then vocabulary restoration.
sub renumber_phrases {
    my $self = shift @_;

    &renumber_suffix_data($self);
    &renumber_thesaurus_data($self);
    return &restore_vocabulary_data($self);
}
972
973
974
# renumber_suffix_data
#
# Translate phrases file to phrases.2 using phind keys instead
# of suffix keys and sorting the expansion data.
#
# The phrases file is read twice: once to give every suffix-id a
# phind-id (its line number) and record its frequency, and once to
# write each line out with the translated ids.  Dies if either file
# cannot be opened or if a line refers to an unknown suffix-id.

sub renumber_suffix_data {
    my ($self) = @_;

    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};
    print $out "Translate phrases: suffix-ids become phind-id's\n"
        if ($verbosity);

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, 'phrases');
    my $outfile = &util::filename_cat($phinddir, 'phrases.2');

    # Read the phrase file. Calculate initial set of phind-id
    # numbers and store (suffixid -> frequency) relation.

    my %suffixtophind;
    my @totalfrequency;    # indexed by phind-id
    my (@fields, $suffixid);
    my $nextphind = 1;

    open(my $inhandle, '<', $infile)
        || die "Cannot open $infile: $!";
    while (defined(my $line = <$inhandle>)) {

        chomp $line;
        @fields = split(/:/, $line);

        # get next suffixid and phindid
        $suffixid = shift @fields;
        $suffixtophind{$suffixid} = $nextphind;

        # store total frequency (the field after the phrase symbols)
        shift @fields;
        $totalfrequency[$nextphind] = shift @fields;

        $nextphind++;
    }
    close $inhandle;


    # Translate phrases file to phrases.2. Use phind keys (not suffix
    # keys), sort expansion and document occurrence data in order of
    # descending frequency.
    open($inhandle, '<', $infile)
        || die "Cannot open $infile: $!";
    open(my $outhandle, '>', $outfile)
        || die "Cannot open $outfile: $!";

    my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
    my (@documents, @newexp, $k, $n);
    my $linenumber = 0;

    while (defined(my $line = <$inhandle>)) {

        # read the line
        chomp $line;
        @fields = split(/:/, $line);

        # get a phrase number for this line
        $suffixid = shift @fields;
        die "ERROR - no phindid for: $suffixid"
            unless (defined($suffixtophind{$suffixid}));
        $phindid = $suffixtophind{$suffixid};

        # get the symbols in the phrase
        $text = shift @fields;

        # output status information
        $linenumber++;
        if ($verbosity > 2) {
            if ($linenumber % 1000 == 0) {
                print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
            }
            print $out "$phindid: $suffixid\t($text)\n" if ($verbosity > 3);
        }

        # get the phrase frequency
        $tf = shift @fields;

        # get the number of expansions
        $countexp = shift @fields;

        # get the expansions, convert them into phind-id numbers, and sort them
        $expansions = shift @fields;
        @newexp = ();
        foreach $k (split(/,/, $expansions)) {
            die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
            $n = $suffixtophind{$k};
            push @newexp, $n;
        }
        @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;

        # get the number of documents
        $countdocs = shift @fields;

        # get the documents and sort them
        $documents = shift @fields;
        $documents =~ s/d//g;
        @documents = split(/;/, $documents);
        @documents = sort by_doc_frequency @documents;

        # output the phrase data
        print $outhandle "$phindid:$text:$tf:$countexp:$countdocs:";
        print $outhandle join(",", @newexp), ",:", join(";", @documents), ";\n";

    }

    close $inhandle;
    close $outhandle;
}
1086
1087
# renumber_thesaurus_data
#
# Translate phrases.2 to phrases.3, adding thesaurus data if available.
#
# Arguments:
#   $self - the classifier object; this routine reads its 'outhandle',
#           'verbosity', 'thesaurus' and 'phinddir' entries.
#
# Files (all located in $self->{'phinddir'}):
#   phrases.2            input:  phrase data keyed by phind-id
#   <thesaurus>.numbers  input:  thesaurus data keyed by thesaurus-id
#   <thesaurus>.phindid  output: thesaurus data renumbered to phind-ids
#   phrases.3            output: phrase data with thesaurus links appended
#
# If no thesaurus is configured, phrases.2 is simply moved to phrases.3.
#
# Returns nothing.  Dies (with a message naming the file) if any file
# cannot be opened or written, if a thesaurus line lacks an id or symbols,
# or if a relation refers to an unknown thesaurus-id.

sub renumber_thesaurus_data {
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    my $thesaurus = $self->{'thesaurus'};

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, "phrases.2");
    my $outfile = &util::filename_cat($phinddir, "phrases.3");

    # If no thesaurus is defined, simply move the phrases file.
    if (!$thesaurus) {
        print $out "Translate phrases.2: no thesaurus data\n"
            if ($verbosity);
        &util::mv($infile, $outfile);
        return;
    }

    print $out "Translate phrases.2: add thesaurus data\n"
        if ($verbosity);

    my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
    my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");

    my ($thesid, $symbols, $relations, $phindid, $freq, $linkcounter);

    # 1.
    # Read thesaurus file and store (symbols->thesaurusid) mapping
    my %symbolstothesid;

    open(my $th, '<', $thesaurusfile)
        or die "Cannot open $thesaurusfile: $!";
    while (my $line = <$th>) {
        chomp $line;
        ($thesid, $symbols) = split(/:/, $line);

        # malformed lines are reported (fatally) in step 4
        next unless (defined($thesid) && defined($symbols));
        $symbolstothesid{$symbols} = $thesid;
    }
    close($th);

    # 2.
    # Read phrases file to find thesaurus entries that already
    # have a phindid.  Store their phind-ids for later translation,
    # and store their frequency for later sorting.
    my %thesaurustophindid;
    my %phindidtofrequency;
    my $maxphindid = 0;

    open(my $in, '<', $infile)
        or die "Cannot open $infile: $!";
    while (my $line = <$in>) {
        chomp $line;
        ($phindid, $symbols, $freq) = split(/:/, $line);
        next unless (defined($phindid) && defined($symbols));

        # Track the largest id in use so that new ids never collide,
        # whatever order the phrases file happens to be in.
        $maxphindid = $phindid if ($phindid > $maxphindid);

        # do we have a thesaurus id corresponding to this phrase?
        if (defined($symbolstothesid{$symbols})) {
            $thesid = $symbolstothesid{$symbols};
            $thesaurustophindid{$thesid} = $phindid;
            $phindidtofrequency{$phindid} = $freq;
        }
    }
    close($in);

    undef %symbolstothesid;

    # 3.
    # Create phind-id numbers for remaining thesaurus entries,
    # and note that their frequency is 0 for later sorting.
    my $nextphindid = $maxphindid + 1;

    open($th, '<', $thesaurusfile)
        or die "Cannot open $thesaurusfile: $!";
    while (my $line = <$th>) {
        chomp $line;
        ($thesid) = split(/:/, $line);
        next unless (defined($thesid));

        # ensure the thesaurus-id has a corresponding phind-id
        if (!defined($thesaurustophindid{$thesid})) {
            $thesaurustophindid{$thesid} = $nextphindid;
            $phindidtofrequency{$nextphindid} = 0;
            $nextphindid++;
        }
    }
    close($th);

    # 4.
    # Translate thesaurus file, replacing thesaurus-id numbers with
    # phind-id numbers.
    open($th, '<', $thesaurusfile)
        or die "Cannot open $thesaurusfile: $!";
    open(my $to, '>', $newthesaurusfile)
        or die "Cannot open $newthesaurusfile: $!";
    while (my $line = <$th>) {
        chomp $line;
        ($thesid, $symbols, $relations) = split(/:/, $line);

        die "Malformed entry in $thesaurusfile (line $.)"
            unless ($thesid && $symbols);
        die "No phind-id for thesaurus-id $thesid in $thesaurusfile (line $.)"
            unless ($thesaurustophindid{$thesid});
        $phindid = $thesaurustophindid{$thesid};

        # Convert each part of the relation string to use phind-id numbers.
        # An entry without relations is treated as an empty relation list.
        my %linkidtotype;
        $relations = "" unless (defined($relations));

        foreach my $linktext (split(/;/, $relations)) {
            # the first element is the linktype (e.g. BT, NT),
            # the rest are the thesaurus-ids linked with that type
            my ($type, @linkthesids) = split(/,/, $linktext);

            foreach my $linkthesid (@linkthesids) {
                die "Unknown thesaurus-id $linkthesid in $thesaurusfile (line $.)"
                    unless (defined($thesaurustophindid{$linkthesid}));
                $linkidtotype{$thesaurustophindid{$linkthesid}} = $type;
            }
        }

        # Sort the list of links, first by descending frequency, then by
        # type.  The final comparison on the id itself only makes the
        # order of otherwise-equal links reproducible between runs.
        my @links = sort {
            ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
                or ($linkidtotype{$a} cmp $linkidtotype{$b})
                or ($a <=> $b)
        } (keys %linkidtotype);
        $linkcounter = scalar(@links);

        # Create a string describing the link information: runs of links
        # sharing a type are written as "TYPE,id,id;".
        my $linktype = (@links) ? $linkidtotype{$links[0]} : "";
        my $newrelation = $linktype;
        foreach my $linkid (@links) {
            if ($linkidtotype{$linkid} ne $linktype) {
                $linktype = $linkidtotype{$linkid};
                $newrelation .= ";" . $linktype;
            }
            $newrelation .= "," . $linkid;
        }
        $newrelation .= ";";

        # output the new line
        print $to "$phindid:$symbols:$linkcounter:$newrelation:\n";
    }
    close($th);
    close($to)
        or die "Cannot write $newthesaurusfile: $!";

    undef %thesaurustophindid;
    undef %phindidtofrequency;

    # 5.
    # Read thesaurus data (in phind-id format) into memory
    my %thesaurusdata;

    open($th, '<', $newthesaurusfile)
        or die "Cannot open $newthesaurusfile: $!";
    while (my $line = <$th>) {
        chomp $line;
        ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $line);
        die "Malformed entry in $newthesaurusfile (line $.)"
            unless ($phindid && $symbols);
        $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
    }
    close($th);

    # 6.
    # Add thesaurus data to phrases file
    open($in, '<', $infile)
        or die "Cannot open $infile: $!";
    open(my $phrases, '>', $outfile)
        or die "Cannot open $outfile: $!";

    # Update existing phrases
    while (my $line = <$in>) {
        chomp $line;
        my @fields = split(/:/, $line);

        # get data for this line
        $phindid = shift @fields;

        # output the phrase data, with thesaurus information
        print $phrases "$phindid:", join(":", @fields);

        # Add thesaurus data.  The entry is removed once used, so that
        # only thesaurus-only phrases remain for the loop below.
        my $data = delete $thesaurusdata{$phindid};
        if (defined($data)) {
            (undef, $linkcounter, $relations) = split(/:/, $data);
            print $phrases ":$linkcounter:$relations";
        }
        print $phrases "\n";
    }
    close($in);

    # Add phrases that aren't already in the file
    foreach my $id (sort { $a <=> $b } keys %thesaurusdata) {
        ($symbols, $linkcounter, $relations) =
            split(/:/, $thesaurusdata{$id});
        print $phrases "$id:$symbols:0:0:0:::$linkcounter:$relations\n";
    }
    close($phrases)
        or die "Cannot write $outfile: $!";

}
1315
# restore_vocabulary_data
#
# Read phrases.3 and restore vocabulary information.  Then write
# this data to the MGPP input files (pword.txt and pdata.txt) and
# (if requested) to the saved phrases file.
#
# Arguments:
#   $self - the classifier object; this routine reads its 'outhandle',
#           'verbosity', 'phinddir' and 'savephrases' entries.
#
# Files (in $self->{'phinddir'} unless noted):
#   clauses.vocab  input:  one vocabulary word per line; line N holds
#                          the word for symbol "sN" (numbering from 1)
#   phrases.3      input:  colon-separated phrase data
#   pdata.txt      output: one "<Document>" record of phrase data per phrase
#   pword.txt      output: one "<Document>" record per phrase holding the
#                          word itself for single-word phrases, else empty
#   $self->{'savephrases'} (optional) output: tab-separated
#                  "field0 <TAB> field2 <TAB> isThesaurus <TAB> text"
#
# Returns nothing.  Dies (with a message naming the file) if any file
# cannot be opened or written.

sub restore_vocabulary_data {
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, 'phrases.3');
    my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
    my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
    my $wordfile = &util::filename_cat($phinddir, 'pword.txt');

    my $savephrases = $self->{'savephrases'};

    # 1.
    # Read the vocabulary file (symbol numbers start at 1)
    open(my $vocab, '<', $vocabfile)
        or die "Cannot open $vocabfile: $!";
    my @symbol;
    my $i = 1;
    while (my $entry = <$vocab>) {
        chomp $entry;
        $symbol[$i++] = $entry;
    }
    close($vocab);

    # 2.
    # Translate phrases.3 to MGPP input files
    open(my $in, '<', $infile)
        or die "Cannot open $infile: $!";
    open(my $data, '>', $datafile)
        or die "Cannot open $datafile: $!";
    open(my $words, '>', $wordfile)
        or die "Cannot open $wordfile: $!";

    # Save the phrases in a separate text file
    my $save;
    if ($savephrases) {
        print $out "Saving phrases in $savephrases\n" if ($verbosity);
        open($save, '>', $savephrases)
            or die "Cannot open $savephrases: $!";
    }

    while (my $line = <$in>) {

        # read the line
        chomp $line;
        my @fields = split(/:/, $line);

        # get a phrase number for this line
        my $key = shift @fields;

        # restore the text of the phrase
        my $text = shift @fields;
        $text =~ s/s(\d+)/$symbol[$1]/g;

        # Only single-word phrases go into the word index.  $word is
        # reset for every line so that a multi-word or 'untranslated'
        # phrase can never inherit the previous line's word.
        my $word = "";
        if ($text !~ / / && $text ne 'untranslated') {
            $word = $text;
        }

        # output the phrase data
        print $data "<Document>";
        print $data "$key:$text:", join(":", @fields), ":\n";

        # output the word index search data
        print $words "<Document>$word\n";

        # output the phrases to a text file
        if ($savephrases) {
            # lines with seven remaining fields are flagged as thesaurus
            # phrases
            my $isThesaurus = (scalar(@fields) == 7) ? 1 : 0;
            print {$save} $fields[0], "\t", $fields[2],
                "\t$isThesaurus\t$text\n";
        }
    }
    close($in);
    close($words)
        or die "Cannot write $wordfile: $!";
    close($data)
        or die "Cannot write $datafile: $!";
    if ($savephrases) {
        close($save)
            or die "Cannot write $savephrases: $!";
    }

}
1407
1408
1409
1410	# sort routines used to renumber phrases
1411
# Sort comparator: orders the two values being compared ($a and $b)
# in ascending numeric order.
sub numerically {
    return ($a <=> $b);
}
1413
# Sort comparator for document occurrence entries, ordering them by
# descending frequency.  An entry containing a comma has its leading
# "<digits>," removed and the remainder is used as its frequency; an
# entry without a comma counts as frequency 1.
sub by_doc_frequency {
    my ($freq_a, $freq_b) = map {
        my $freq = 1;
        if (/,/) {
            # work on a copy so the value being sorted is left untouched
            ($freq = $_) =~ s/\d+,//;
        }
        $freq;
    } ($a, $b);

    return ($freq_b <=> $freq_a);
}
1428
1429	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: