source: trunk/gsdl/perllib/classify/phind.pm@ 2576

Last change on this file since 2576 was 2576, checked in by sjboddie, 23 years ago

Moved phind's stopword directory from etc to etc/packages/phind

[1643]1###########################################################################
2#
3# phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
[1890]29# The phind classifier plugin.
30# Options are described in the print_usage function.
31# Type "classinfo.pl phind" at the command line for a summary.
[1643]32
33package phind;
34
35use BasClas;
36use util;
[1897]37use ghtml;
38use unicode;
[1643]39
[2509]40my @removedirs = ();
41
42my %wanted_index_files = ('td'=>1,
43 't'=>1,
44 'ti'=>1,
45 'tl'=>1,
46 'tsd'=>1,
47 'idb'=>1,
48 'ib1'=>1,
49 'ib2'=>1,
50 'ib3'=>1,
51 'i'=>1,
52 'il'=>1,
53 'w'=>1,
54 'wa'=>1);
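# (These suffixes appear to cover the compressed text and index files that
# mgpp builds for the "pdata", "pword" and "docs" stems in get_classify_info();
# anything else left in a phind working directory is treated as a temporary
# file and removed by the END block below, unless -untidy was given.)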
55
[1643]56sub BEGIN {
57 @ISA = ('BasClas');
58}
59
[2509]60sub END {
61
62 # Tidy up stray files - we do this here as there's some weird problem
63 # preventing us from doing it in the get_classify_info() function (on
64 # windows at least) where the close() appears to fail on txthandle and
65 # dochandle, thus preventing us from deleting those files
[1643]66
[2509]67 foreach my $dir (@removedirs) {
68 if (-d $dir && opendir (DIR, $dir)) {
69 my @files = readdir DIR;
70 closedir DIR;
71
72 foreach $file (@files) {
73 next if $file =~ /^\.\.?$/;
74 my ($suffix) = $file =~ /\.([^\.]+)$/;
75 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
76 # delete it!
77 &util::rm (&util::filename_cat ($dir, $file));
78 }
79 }
80 }
81 }
82}
83
[1839]84sub print_usage {
85 print STDERR "
86 usage: classify phind [options]
87
88 options:
[1883]89 -text Fields The text used to build the phrase hierarchy.
90 (default: 'section:Title,section:text')
[1839]91
[1883]92 -title Title The metadata field used to describe each document.
93 (default: 'Title')
94
95 -button Name The label for the classifier screen and button in the
96 navigation bar.
97 (default: 'Phrase')
98
99 -language Regex Language or languages to use when building the hierarchy.
100 Languages are identified by two-letter language codes
101 like en (English), es (Spanish), and fr (French).
102 Language is a regular expression, so 'en|fr' (English or
103 French) and '..' (match any language) are valid.
104 (default: 'en')
105
106 -savephrases File If set, the phrase information will be stored in
107 the given file as text. It is probably a good idea
108 to use an absolute path.
109 (default: not set)
110
111 -suffixmode N The smode parameter to the phrase extraction program.
112 A value of 0 means that stopwords are ignored; a value
113 of 1 means that stopwords are used.
114 (default: 1)
115
116 -thesaurus Name Name of a thesaurus stored in phind format in the
117 collection's etc directory.
118 (default: not set)
119
120 -untidy Don't remove working files.
121
122"; }
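# As a rough illustration (not taken from a real collection), a collect.cfg
# entry using these options might look like:
#
#   classify phind -text section:Title,section:text -title Title -button Phrase -language en
#
# An optional "-thesaurus mythesaurus" (a hypothetical name) would point at
# mythesaurus.lnk and mythesaurus.en files stored in phind format in the
# collection's etc directory.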
123
[1890]124# Phrase delimiter symbols - these should be abstracted out someplace
125
126my $colstart = "COLLECTIONSTART";
127my $colend = "COLLECTIONEND";
128my $doclimit = "DOCUMENTLIMIT";
129my $senlimit = "SENTENCELIMIT";
130my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
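# Roughly speaking (based on how they are written out below): COLLECTIONSTART
# and COLLECTIONEND bracket the whole collection in the clauses.numbers file,
# DOCUMENTLIMIT is written to the clauses file before each document's text,
# and SENTENCELIMIT marks the end of each clause.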
131
132
[1839]133# Create a new phind browser based on collect.cfg
134
[1643]135sub new {
[1839]136 my $class = shift (@_);
137 my $self = new BasClas($class, @_);
[1643]138
[1839]139 my $out = $self->{'outhandle'};
140
[1643]141 # Ensure the Phind "suffix" program is in place
142 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
[2487]143 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
[1643]144 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
145
146 if (!(-e $file1)) {
[2500]147 print STDERR "phind.pm: ERROR: The phind \"suffix\" program is not installed.\n\n";
[1643]148 exit(1);
149 }
150
[1839]151 # Parse classifier arguments
[1803]152 my $builddir = "";
[1839]153 if (!parsargv::parse(\@_,
154 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
155 q^title/.*/Title^, \$self->{'titlefield'},
156 q^button/.*/Phrase^, \$self->{'buttonname'},
[1871]157 q^language/.*/en^, \$language,
[1839]158 q^builddir/.*/^, \$builddir,
159 q^savephrases/\d/0^, \$self->{'savephrases'},
160 q^suffixmode/\d/1^, \$self->{'suffixmode'},
161 q^thesaurus/.*/^, \$self->{'thesaurus'},
162 q^untidy^, \$self->{'untidy'},
163 "allow_extra_options")) {
164
165 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
166 &print_usage();
167 die "\n";
[1643]168 }
[1839]169
[1803]170 # classifier information
[1839]171 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
[1646]172
[1643]173 # limit languages
174 $self->{'language_exp'} = $language;
[1839]175
176 # collection directories
[1829]177 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
[1803]178 if (!$builddir) {
179 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
[1643]180 }
[1803]181 $self->{'builddir'} = $builddir;
[2487]182
183 $self->{'total'} = 0;
[1839]184
[1643]185 return bless $self, $class;
186}
187
188
189# Initialise the phind classifier
190
191sub init {
192 my $self = shift (@_);
193
[1803]194 # ensure we have a build directory
195 my $builddir = $self->{'builddir'};
196 die unless (-e "$builddir");
197
198 # create phind directory
[2025]199 my $phnumber = 1;
200 my $phinddir = &util::filename_cat($builddir, "phind1");
201 while (-e "$phinddir") {
202 $phnumber++;
203 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
[1803]204 }
205 &util::mk_dir("$phinddir");
[2025]206 $self->{'phinddir'} = $phinddir;
207 $self->{'phindnumber'} = $phnumber;
[1803]208
[2509]209 push(@removedirs, $phinddir) unless $self->{'untidy'};
210
[1643]211 # open filehandles for documents and text
[1803]212 my $clausefile = &util::filename_cat("$phinddir", "clauses");
[1643]213 &util::rm($clausefile) if (-e $clausefile);
214
[2025]215 my $txthandle = 'TEXT' . $phnumber;
216 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
217 $self->{'txthandle'} = $txthandle;
218
[1803]219 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
[1643]220 &util::rm($docfile) if (-e $docfile);
[2025]221
222 my $dochandle = 'DOC' . $phnumber;
223 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
224 $self->{'dochandle'} = $dochandle;
[1643]225
226}
227
[1890]228
[1643]229# Classify each document.
230#
231# Each document is passed here in turn. The classifier extracts the
232# text of each and stores it in the clauses file. Document details are
233# stored in the docs.txt file.
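#
# For example (with a made-up OID and title), each document adds one line of
# the form "<Document>\tOID\tTitle" to docs.txt, and writes a DOCUMENTLIMIT
# marker followed by its clause text to the clauses file.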
234
235sub classify {
236 my $self = shift (@_);
[2025]237 my $doc_obj = shift @_;
[1643]238
239 my $verbosity = $self->{'verbosity'};
240 my $top_section = $doc_obj->get_top_section();
241
[1803]242 my $titlefield = $self->{'titlefield'};
243
244 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
[1643]245 print "process: $title\n" if ($verbosity > 2);
246
[1883]247 # Only consider the file if it is in the correct language
[1643]248 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
249 my $phrlanguage = $self->{'language_exp'};
250 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
251
252 # record this file
[2487]253 $self->{'total'} ++;
254 print "file $self->{'total'}: $title\n" if ($self->{'verbosity'});
[1643]255
256
257 # Store document details
258 my $OID = $doc_obj->get_OID();
259 $OID = "NULL" unless defined $OID;
260 my $dochandle = $self->{'dochandle'};
261 print $dochandle "<Document>\t$OID\t$title\n";
262
263 # Store the text occurring in this object
264
265 # output the document delimiter
266 my $txthandle = $self->{'txthandle'};
267 print $txthandle "$doclimit\n";
268
269 # iterate over the required indexes and store their text
270 my $indexes = $self->{'indexes'};
271 my $text = "";
[1890]272 my ($part, $level, $field, $section, $data, $dataref);
[1643]273
274 foreach $part (split(/,/, $indexes)) {
275
276 # Each field has a level and a data element (e.g. document:Title)
277 ($level, $field) = split(/:/, $part);
278 die unless ($level && $field);
[2025]279
[1643]280 # Extract the text from every section
281 # (In phind, document:text and section:text are equivalent)
282 if ($field eq "text") {
283 $data = "";
284 $section = $doc_obj->get_top_section();
285 while (defined($section)) {
286 $data .= $doc_obj->get_text($section) . "\n";
287 $section = $doc_obj->get_next_section($section);
288 }
[1949]289 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
[1643]290 }
291
292 # Extract a metadata field from a document
[1890]293 # (If there is more than one element of the given type, get them all.)
[1643]294 elsif ($level eq "document") {
[1890]295 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
[2206]296 foreach $data (@$dataref) {
[1949]297 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
[1890]298 }
299 }
300
[1643]301 # Extract metadata from every section in a document
302 elsif ($level eq "section") {
303 $data = "";
304 $section = $doc_obj->get_top_section();
305 while (defined($section)) {
[2025]306 $dataref = $doc_obj->get_metadata($section, $field);
307 $data .= join("\n", @$dataref) . "\n";
[1643]308 $section = $doc_obj->get_next_section($section);
309 }
[1949]310 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
[1643]311 }
312
313 # Some sort of specification which I don't understand
314 else {
315 die "Unknown level ($level) in phind index ($part)\n";
316 }
317
318 }
[2025]319
[1643]320 # output the text
321 $text =~ tr/\n//s;
322 print $txthandle "$text";
323}
324
325
326# Construct the classifier from the information already gathered
327#
328# When get_classify_info is called, the clauses and docs.txt files have
[1803]329# already been constructed in the phind directory. This function will
[1643]330# translate them into compressed, indexed MGPP files that can be read by
331# the phindcgi script. It will also register our classifier so that it
[1803]332# shows up in the navigation bar.
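#
# Three mgpp databases are built in the phind directory: "pdata" (the phrase
# data), "pword" (the word-level search index) and "docs" (the document
# information); phindcgi reads all three at run time.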
[1643]333
334sub get_classify_info {
335 my $self = shift (@_);
336
[2487]337 close $self->{'dochandle'};
[2333]338 close $self->{'txthandle'};
[1643]339 my $verbosity = $self->{'verbosity'};
[1839]340 my $out = $self->{'outhandle'};
[1883]341 my $phinddir = $self->{'phinddir'};
[1839]342
[2487]343 my $osextra = "";
344 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
345 $osextra = " -d /";
346 }
347
[1645]348 if ($verbosity) {
[1839]349 print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
[2539]350 print $out "*** in ", $self->{'phinddir'}, "\n";
[1645]351 }
352
[1643]353 # Construct phind indexes
[1646]354 my $suffixmode = $self->{'suffixmode'};
[1643]355 my ($command, $status);
356
357 # Generate the vocabulary, symbol statistics, and numbers file
358 # from the clauses file
[1839]359 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
[1829]360 &extract_vocabulary($self);
[1643]361
[1803]362 # Use the suffix program to generate the phind/phrases file
[1839]363 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
[2529]364 &execute("suffix \"$phinddir\" $suffixmode $verbosity", $verbosity, $out);
[1643]365
[1803]366 # Create the phrase file and put phrase numbers in phind/phrases
[2025]367 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
[1808]368 &renumber_phrases($self);
[1643]369
[1839]370 print $out "\nCreating phrase databases\n";
[1803]371 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
[2487]372 my $mg_stem = &util::filename_cat($phinddir, "pdata");
[1643]373
[2529]374 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
375 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
376 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
[1643]377
378 # create the mg index of words
[1839]379 print $out "\nCreating word-level search indexes\n";
[1803]380 $mg_input = &util::filename_cat($phinddir, "pword.txt");
[2487]381 $mg_stem = &util::filename_cat($phinddir, "pword");
[1643]382
[2529]383 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
384 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
385 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
386 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
387 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
388 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
[1643]389
[2529]390 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
391 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
392 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
[1643]393
394 # create the mg document information database
[1839]395 print $out "\nCreating document information databases\n";
[1803]396 $mg_input = &util::filename_cat($phinddir, "docs.txt");
[2487]397 $mg_stem = &util::filename_cat($phinddir, "docs");
[1643]398
[2529]399 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
400 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
401 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
[1643]402
[2025]403 # Return the information about the classifier that we'll later want to
404 # use to create macros when the Phind classifier document is displayed.
[1643]405 my %classifyinfo = ('thistype'=>'Invisible',
[2025]406 'childtype'=>'Phind',
407 'Title'=>$self->{'buttonname'},
408 'parameters'=>"phindnumber=$self->{'phindnumber'}",
[1643]409 'contains'=>[]);
410
[2025]411 my $collection = $self->{'collection'};
412 my $url = "library?a=p&p=phind&c=$collection";
[1643]413 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
[2025]414
[1643]415 return \%classifyinfo;
416}
417
418
419
420sub convert_gml_to_tokens {
421
[1949]422 my ($language_exp, $text) = @_;
[1643]423
[1897]424 if ($language_exp =~ /en/) {
[1949]425 return &convert_gml_to_tokens_EN($text);
[1897]426 }
427
[1890]428 $_ = $text;
[1643]429
[2008]430 # 1. remove GML tags
[1643]431
432 # Remove everything that is in a tag
[1890]433 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
434 s/\s*<br>\s*/ LINEBREAK /isgo;
435 s/<[^>]*>/ /sgo;
[1643]436
437 # Now we have the text, but it may contain HTML
438 # elements coded as &gt; etc. Remove these tags.
[2008]439 s/&amp;/&/sgo;
[1890]440 s/&lt;/</sgo;
441 s/&gt;/>/sgo;
442 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
443 s/\s*<br>\s*/ LINEBREAK /isgo;
444 s/<[^>]*>/ /sgo;
[1643]445
[2008]446 # replace <p> and <br> placeholders with the clause break symbol (\n)
447 s/\s+/ /gso;
[1890]448 s/PARAGRAPHBREAK/\n/sgo;
449 s/LINEBREAK/\n/sgo;
[1643]450
[1890]451
[1897]452
453
[2008]454 # 2. Split the remaining text into space-delimited tokens
[1897]455
[2008]456 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
457 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
[1897]458
[2008]459 # Split text at word boundaries
460 s/\b/ /go;
461
462 # 3. Convert the remaining text to "clause format"
463
464 # Insert newline if the end of a sentence is detected
[1897]465 # (delimiter is: "[\.\?\!]\s")
[2008]466 # s/\s*[\.\?\!]\s+/\n/go;
[1897]467
[2008]468 # remove unnecessary punctuation and replace with clause break symbol (\n)
469 s/[^\w ]/\n/go;
470
471 # remove extraneous whitespace
[1897]472 s/ +/ /sgo;
473 s/^\s+//mgo;
474 s/\s*$/\n/mgo;
[1643]475
[1897]476 # remove lines that contain one word or less
477 s/^\S*$//mgo;
478 s/^\s*$//mgo;
479 tr/\n//s;
[1643]480
[1897]481 return $_;
482}
[1643]483
[1897]484# A version of convert_gml_to_tokens that is fine-tuned to the English language.
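#
# As a rough illustration (hypothetical input; the details depend on the
# punctuation rules below), a fragment like
#
#   <p>The quick, brown fox. It jumped!</p>
#
# comes out as one clause per line:
#
#   The quick
#   brown fox
#   It jumped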
[1643]485
[1897]486sub convert_gml_to_tokens_EN {
487 $_ = shift @_;
[1643]488
[1897]489 # FIRST, remove GML tags
[1643]490
[1897]491 # Replace all whitespace with a simple space
492 s/\s+/ /gs;
[1890]493
[1897]494 # Remove everything that is in a tag
495 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
496 s/\s*<br>\s*/ LINEBREAK /isg;
497 s/<[^>]*>/ /sg;
498
499 # Now we have the text, but it may contain HTML
500 # elements coded as &gt; etc. Remove these tags.
501 s/&lt;/</sg;
502 s/&gt;/>/sg;
503
504 s/\s+/ /sg;
505 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
506 s/\s*<br>\s*/ LINEBREAK /isg;
507 s/<[^>]*>/ /sg;
508
509 # remove &amp; and other miscellaneous markup tags
510 s/&amp;/&/sg;
511 s/&lt;/</sg;
512 s/&gt;/>/sg;
513 s/&amp;/&/sg;
514
515 # replace <p> and <br> placeholders with carriage returns
516 s/PARAGRAPHBREAK/\n/sg;
517 s/LINEBREAK/\n/sg;
518
519
520 # Exceptional punctuation
521 #
522 # We make special cases of some punctuation
523
524 # remove any apostrophe that indicates omitted letters
525 s/(\w+)\'(\w*\s)/ $1$2 /g;
526
527 # remove periods that appear in a person's initials
528 s/\s([A-Z])\./ $1 /g;
529
530 # replace hyphens in hyphenated words and names with a space
531 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
532
[1643]533 # Convert the remaining text to "clause format".
534 # This means removing all excess punctuation and garbage text,
535 # normalising valid punctuation to fullstops and commas,
[1897]536 # then putting one clause on each line.
[1643]537
538 # Insert newline when the end of a sentence is detected
539 # (delimiter is: "[\.\?\!]\s")
[1897]540 s/\s*[\.\?\!]\s+/\n/g;
[1643]541
[1897]542 # split numbers after four digits
543 s/(\d\d\d\d)/$1 /g;
[1643]544
[1897]545 # split words after 32 characters
[1643]546
[1897]547 # squash repeated punctuation
548 tr/A-Za-z0-9 //cs;
[1643]549
[1897]550 # save email addresses
551 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
[1643]552
[1897]553 # normalise clause breaks (mostly punctuation symbols) to commas
554 s/[^A-Za-z0-9 \n]+/ , /g;
[1643]555
[1897]556 # Remove repeated commas, and replace with newline
557 s/\s*,[, ]+/\n/g;
[1643]558
559 # remove extra whitespace
[1897]560 s/ +/ /sg;
561 s/^\s+//mg;
562 s/\s*$/\n/mg;
[1643]563
564 # remove lines that contain one word or less
[1897]565 s/^\w*$//mg;
566 s/^\s*$//mg;
[1643]567 tr/\n//s;
568
569 return $_;
[1897]570
[1643]571}
572
573
[1897]574
[1643]575# Execute a system command
576
577sub execute {
[1839]578 my ($command, $verbosity, $outhandle) = @_;
579 print $outhandle "Executing: $command\n" if ($verbosity > 2);
[2487]580 $! = 0;
[1643]581 my $status = system($command);
582 if ($status != 0) {
[2487]583 print STDERR "phind - Error executing '$command': $!\n";
[1643]584 exit($status);
585 }
586}
587
588
589# Generate the vocabulary, symbol statistics, and numbers file from the
590# clauses file. This is legacy code, so it is a bit messy and probably won't
591# run under Windows.
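#
# In outline (see the code below): clauses.vocab gets one "best form" word per
# line, so the line number doubles as the word's symbol number; clauses.stats
# records which symbol ranges hold delimiters, stopwords, thesaurus terms and
# content words; and clauses.numbers is the clauses file rewritten with every
# word replaced by its symbol number.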
592
593sub extract_vocabulary {
[1829]594 my ($self) = @_;
595
596 my $verbosity = $self->{'verbosity'};
[1839]597 my $out = $self->{'outhandle'};
[1643]598
[1829]599 my $collectiondir = $self->{'collectiondir'};
600 my $phinddir = $self->{'phinddir'};
601
[1883]602 my $language_exp = $self->{'language_exp'};
603
[1643]604 my ($w, $l, $line, $word);
[1829]605
[1643]606 my ($first_delimiter, $last_delimiter,
607 $first_stopword, $last_stopword,
608 $first_extractword, $last_extractword,
609 $first_contentword, $last_contentword,
610 $phrasedelimiter);
[1829]611
612 my $thesaurus = $self->{'thesaurus'};
613 my ($thesaurus_links, $thesaurus_terms,
614 %thesaurus, $first_thesaurusword, $last_thesaurusword);
[1643]615
616 my %symbol;
617 my (%freq);
618
[1829]619 print $out "Calculating vocabulary\n" if ($verbosity > 1);
[1643]620
621 # Read and store the stopwords
[2576]622 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
[1883]623 my $stopword_files = ();
624 my ($language, $language_dir, $file, $file_name);
[1643]625 my %stopwords;
[1883]626
627 # Examine each directory in the stopword directory
628 opendir(STOPDIR, $stopdir);
629 foreach $language (readdir STOPDIR) {
630
631 # Ignore entries that do not match the classifier's language
632 next unless ($language =~ /$language_exp/);
633 $language_dir = &util::filename_cat($stopdir, $language);
634 next unless (-d "$language_dir");
635
636 opendir(LANGDIR, $language_dir);
637 foreach $file (readdir LANGDIR) {
638
639 # Ignore entries that are not stopword files
640 next unless ($file =~ /sw$/);
641 $file_name = &util::filename_cat($language_dir, $file);
642 next unless (-f "$file_name");
643
644 # Read the stopwords
645 open(STOPFILE, "<$file_name");
646 while (<STOPFILE>) {
647 s/^\s+//;
648 s/\s.*//;
649 $word = $_;
650 $l = lc($word);
651 $stopwords{$l} = $word;
652 }
653 close STOPFILE;
654
655 }
[2333]656 closedir LANGDIR;
[1643]657 }
[2333]658 closedir STOPDIR;
[1883]659
[1829]660 # Read thesaurus information
661 if ($thesaurus) {
662
[1897]663 # link file exists
[1829]664 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
665 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
[1897]666
667 # ensure term file exists in the correct language
668 if ($language_exp =~ /^([a-z][a-z])/) {
669 $language = $1;
670 } else {
671 $language = 'en';
672 }
673 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
[1829]674 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
[1897]675
[1829]676
677 # Read the thesaurus terms
678 open(TH, "<$thesaurus_terms");
[1643]679 while(<TH>) {
680 s/^\d+ //;
681 s/\(.*\)//;
[1829]682 foreach $w (split(/\s+/, $_)) {
[1643]683 $thesaurus{lc($w)} = $w;
684 }
685 }
686 close TH;
687 }
688
689 # Read words in the text and count occurrences
[1829]690 open(TXT, "<$phinddir/clauses");
[1643]691 my @words;
692
693 while(<TXT>) {
694 $line = $_;
695 next unless ($line =~ /./);
696
697 @words = split(/\s+/, $line);
698 foreach $w (@words) {
699 $l = lc($w);
700 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
701 $freq{$w}++;
702 }
703 $freq{$senlimit}++;
704 }
705
[2333]706 close TXT;
707
[1643]708 # Calculate the "best" form of each word
709 my (%bestform, %totalfreq, %bestfreq);
710
711 foreach $w (sort (keys %freq)) {
712 $l = lc($w);
713
714 # totalfreq is the number of times a term appears in any form
715 $totalfreq{$l} += $freq{$w};
716
717 if (defined $stopwords{$l}) {
718 $bestform{$l} = $stopwords{$l};
719
720 } elsif (defined $thesaurus{$l}) {
721 $bestform{$l} = $thesaurus{$l};
722
723 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
724 $bestfreq{$l} = $freq{$w};
725 $bestform{$l} = $w;
726 }
727 }
728
729 undef %freq;
730 undef %bestfreq;
731
732
733 # Assign symbol numbers to tokens
734 my $nextsymbol = 1;
735 my (@vocab);
736
737 # Delimiters
738 $first_delimiter = 1;
739
740 foreach $word (@delimiters) {
741
742 $word = lc($word);
743 $bestform{$word} = uc($word);
744 $vocab[$nextsymbol] = $word;
745 $symbol{$word} = $nextsymbol;
746 $nextsymbol++;
747 }
748 $last_delimiter = $nextsymbol - 1;
749
750 # Stopwords
751 $first_stopword = $nextsymbol;
752
753 foreach my $word (sort keys %stopwords) {
754
755 # don't include a stopword unless it occurs in the text
756 $word = lc($word);
757 next unless ($totalfreq{$word});
758 next if ($symbol{$word});
759
760 $vocab[$nextsymbol] = $word;
761 $symbol{$word} = $nextsymbol;
762 $nextsymbol++;
763 }
764 $last_stopword = $nextsymbol - 1;
765 $first_contentword = $nextsymbol;
766
767 # Thesaurus terms
[1829]768 if ($thesaurus) {
[1643]769 $first_thesaurusword = $nextsymbol;
770
771 foreach my $word (sort keys %thesaurus) {
772
773 $word = lc($word);
774 next if ($symbol{$word});
775 $bestform{$word} = $thesaurus{$word};
776
777 $vocab[$nextsymbol] = $word;
778 $symbol{$word} = $nextsymbol;
779 $nextsymbol++;
780
781 }
782 $last_thesaurusword = $nextsymbol - 1;
783 }
784
785 # Other content words
786 $first_extractword = $nextsymbol;
787
788 foreach my $word (sort (keys %bestform)) {
789
790 next if ($symbol{$word});
791
792 $vocab[$nextsymbol] = $word;
793 $symbol{$word} = $nextsymbol;
794 $nextsymbol++;
795 }
796 $last_extractword = $nextsymbol - 1;
797 $last_contentword = $nextsymbol - 1;
798
799
800 # Output the words
[1829]801 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
802 open(VOC, ">$phinddir/clauses.vocab");
[1643]803
804 for (my $i = 1; $i < $nextsymbol; $i++) {
805 $w = $vocab[$i];
806
807 print VOC "$bestform{$w}\n";
808 $totalfreq{$w} = 0 unless ($totalfreq{$w});
809 }
810 close VOC;
811
812
[1829]813 # Create statistics file
[1643]814 # Output statistics about the vocabulary
[1829]815 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
816 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
[1643]817
[1829]818 open(STAT, ">$phinddir/clauses.stats")
819 || die "Cannot open $phinddir/clauses.stats: $!";
820
[1643]821 print STAT "first_delimiter $first_delimiter\n";
822 print STAT "last_delimiter $last_delimiter\n";
823 print STAT "first_stopword $first_stopword\n";
824 print STAT "last_stopword $last_stopword\n";
[1829]825 if ($thesaurus) {
[1643]826 print STAT "first_thesaurusword $first_thesaurusword\n";
827 print STAT "last_thesaurusword $last_thesaurusword\n";
828 }
829 print STAT "first_extractword $first_extractword\n";
830 print STAT "last_extractword $last_extractword\n";
831 print STAT "first_contentword $first_contentword\n";
832 print STAT "last_contentword $last_contentword\n";
833 print STAT "first_symbol $first_delimiter\n";
834 print STAT "last_symbol $last_contentword\n";
835 print STAT "first_word $first_stopword\n";
836 print STAT "last_word $last_contentword\n";
837 close STAT;
838
839 undef @vocab;
840
841
[1829]842 # Create numbers file
[1643]843 # Save text as symbol numbers
[1829]844 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
[1643]845
[1829]846 open(TXT, "<$phinddir/clauses");
847 open(NUM, ">$phinddir/clauses.numbers");
[1643]848
849 $phrasedelimiter = $symbol{lc($senlimit)};
850 print NUM "$symbol{lc($colstart)}\n";
851
852 # set up the special symbols that delimit documents and sentences
853 while(<TXT>) {
854
855 # split sentence into a list of tokens
856 $line = $_;
857 next unless ($line =~ /./);
858 @words = split(/\s+/, $line);
859
860 # output one token at a time
861 foreach $word (@words) {
862 $word = lc($word);
863 print NUM "$symbol{$word}\n";
864 }
865
866 # output phrase delimiter
867 print NUM "$phrasedelimiter\n";
868 }
869
[2333]870 close TXT;
[1643]871 print NUM "$symbol{lc($colend)}\n";
[1829]872 close NUM;
[1643]873
[1829]874 # Save thesaurus data in one convenient file
875 if ($thesaurus) {
876
877 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
878
879
880 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
881 if ($verbosity > 1);
882
883 # Read the thesaurus terms
884 my ($num, $text, %thes_symbols);
885
886 open(TH, "<$thesaurus_terms");
887 while(<TH>) {
888 chomp;
889 @words = split(/\s+/, $_);
890 $num = shift @words;
891 $text = "";
892
893 # translate words into symbol numbers
894 foreach $word (@words) {
895 $word = lc($word);
896 if ($symbol{$word}) {
897 $text .= "s$symbol{$word} ";
898 } elsif ($verbosity) {
899 print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
900 }
901 }
902 $text =~ s/ $//;
903 $thes_symbols{$num} = $text;
904 }
905 close TH;
906
907 # Read the thesaurus links and write the corresponding data
908 open(TH, "<$thesaurus_links");
909 open(THOUT, ">$thesaurusfile");
910
911 while(<TH>) {
912 chomp;
913 ($num, $text) = split(/:/, $_);
914
915 if (defined($thes_symbols{$num})) {
916 print THOUT "$num:$thes_symbols{$num}:$text\n";
917 } else {
918 print THOUT "$num:untranslated:$text\n";
919 }
920 }
921 close TH;
922 close THOUT;
923 }
924
925
926
927
[1643]928}
929
930
[1829]931# renumber_phrases
932#
933# Prepare the phrases file to be input to mgpp. The biggest problem is
934# reconciling the phrase identifiers used by the suffix program (which
935# we'll call suffix-id numbers) with the numbers used in the thesaurus
936# (thesaurus-id) to create a common set of phind-id numbers (phind-id).
937# Phind-id numbers must be sorted by frequency of occurrence.
938#
939# Start creating a set of phind-id numbers from the sorted suffix-id
940# numbers and (if required) the thesaurus-id numbers. Then add any other
941# phrases occurring in the thesaurus.
942#
943# The last thing we have to do is restore the vocabulary information to the
944# phrase file so that the phrases are stored as words, not as symbol
945# numbers.
[1643]946
[1829]947# The original phrases file looks something like this:
948# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
949# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
950# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
951# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
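#
# After renumber_suffix_data, each line of phrases.2 instead has the form
# (taken from the print statements below; the trailing comma and semicolon
# are part of the format):
# phind-id:symbols:frequency:expansion-count:document-count:exp,exp,...,:doc;doc;...;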
[1643]952
953
954sub renumber_phrases {
[1829]955 my ($self) = @_;
[1643]956
[1829]957 renumber_suffix_data($self);
958 renumber_thesaurus_data($self);
959 restore_vocabulary_data($self);
[1808]960
[1829]961}
[1808]962
963
[1643]964
[1829]965# renumber_suffix_data
966#
967# Translate phrases file to phrases.2 using phind keys instead
968# of suffix keys and sorting the expansion data.
969
970sub renumber_suffix_data {
971 my ($self) = @_;
[1643]972
[1829]973 my $verbosity = $self->{'verbosity'};
[1839]974 my $out = $self->{'outhandle'};
[1829]975 print $out "Translate phrases: suffix-ids become phind-ids\n"
976 if ($verbosity);
977
978 my $phinddir = $self->{'phinddir'};
979 my $infile = &util::filename_cat($phinddir, 'phrases');
980 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
[1643]981
[1829]982 # Read the phrase file. Calculate initial set of phind-id
983 # numbers and store (suffixid -> frequency) relation.
[1643]984
[1829]985 my %suffixtophind;
986 my @totalfrequency;
987 my (@fields, $suffixid);
988 my $nextphind = 1;
[1643]989
[1829]990 open(IN, "<$infile");
[1643]991 while(<IN>) {
[1829]992
[1643]993 chomp;
[1829]994 @fields = split(/:/, $_);
[1643]995
[1829]996 # get next suffixid and phindid
997 $suffixid = shift @fields;
998 $suffixtophind{$suffixid} = $nextphind;
999
1000 # store total frequency
1001 shift @fields;
1002 $totalfrequency[$nextphind] = shift @fields;
1003
1004 $nextphind++;
[1643]1005 }
[1829]1006 close IN;
[1808]1007
[1829]1008
1009 # Translate phrases file to phrases.2. Use phind keys (not suffix
1010 # keys), sort expansion and document occurrence data in order of
1011 # descending frequency.
1012 open(IN, "<$infile");
1013 open(OUT, ">$outfile");
[1643]1014
[1829]1015 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1016 my (@documents, @newexp, $k, $n);
[1643]1017 my $linenumber = 0;
1018
1019 while(<IN>) {
1020
1021 # read the line
1022 chomp;
[1829]1023 @fields = split(/:/, $_);
[1643]1024
1025 # get a phrase number for this line
[1829]1026 $suffixid = shift @fields;
1027 die unless (defined($suffixtophind{$suffixid}));
1028 $phindid = $suffixtophind{$suffixid};
[1643]1029
[1829]1030 # get the symbols in the phrase
[1643]1031 $text = shift @fields;
1032
[1829]1033 # output status information
[1643]1034 $linenumber++;
[1829]1035 if ($verbosity > 2) {
1036 if ($linenumber % 1000 == 0) {
1037 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1038 }
1039 print $out "$linenumber: $phindid\t($text)\n" if ($verbosity > 3);
[1643]1040 }
[1829]1041
[1643]1042 # get the phrase frequency
1043 $tf = shift @fields;
1044
1045 # get the number of expansions
1046 $countexp = shift @fields;
1047
[1829]1048 # get the expansions, convert them into phind-id numbers, and sort them
[1643]1049 $expansions = shift @fields;
1050 @newexp = ();
1051 foreach $k (split(/,/, $expansions)) {
[1829]1052 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1053 $n = $suffixtophind{$k};
[1643]1054 push @newexp, $n;
1055 }
[1829]1056 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
[1643]1057
1058 # get the number of documents
1059 $countdocs = shift @fields;
1060
[1829]1061 # get the documents and sort them
[1643]1062 $documents = shift @fields;
1063 $documents =~ s/d//g;
1064 @documents = split(/;/, $documents);
[1829]1065 @documents = sort by_doc_frequency @documents;
[1643]1066
1067 # output the phrase data
[1829]1068 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1069 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1070
1071 }
1072
1073 close IN;
1074 close OUT;
1075}
1076
1077
1078# renumber_thesaurus_data
1079#
1080# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1081
1082sub renumber_thesaurus_data {
1083 my ($self) = @_;
1084
[1839]1085 my $out = $self->{'outhandle'};
[1829]1086 my $verbosity = $self->{'verbosity'};
1087 my $thesaurus = $self->{'thesaurus'};
1088
1089 my $phinddir = $self->{'phinddir'};
1090 my $infile = &util::filename_cat($phinddir, "phrases.2");
1091 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1092
1093
1094 # If no thesaurus is defined, simply move the phrases file.
1095 if (!$thesaurus) {
1096 print $out "Translate phrases.2: no thesaurus data\n"
1097 if ($verbosity);
1098 &util::mv($infile, $outfile);
1099 return;
1100 }
1101
1102 print $out "Translate phrases.2: add thesaurus data\n"
1103 if ($verbosity);
1104
1105 # 1.
1106 # Read thesaurus file and store (symbols->thesaurusid) mapping
1107 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1108 my %symbolstothesid;
1109 my (@fields, $thesid, $symbols);
1110
1111 open(TH, "<$thesaurusfile");
1112
1113 while (<TH>) {
1114
1115 chomp;
1116 @fields = split(/:/, $_);
1117
1118 # get id and text
1119 $thesid = shift @fields;
1120 $symbols = shift @fields;
1121 $symbolstothesid{$symbols} = $thesid;
1122 }
1123 close TH;
1124
1125 # 2.
[1839]1126 # Read phrases file to find thesaurus entries that already
[2064]1127 # have a phindid. Store their phind-ids for later translation,
1128 # and store their frequency for later sorting.
[1829]1129 my %thesaurustophindid;
[2064]1130 my %phindidtofrequency;
1131 my ($phindid, $freq);
[1829]1132
1133 open(IN, "<$infile");
1134
1135 while(<IN>) {
1136
1137 chomp;
1138 @fields = split(/:/, $_);
1139
1140 # phindid and symbols for this line
1141 $phindid = shift @fields;
1142 $symbols = shift @fields;
[2064]1143 $freq = shift @fields;
[1829]1144
1145 # do we have a thesaurus id corresponding to this phrase?
1146 if (defined($symbolstothesid{$symbols})) {
1147 $thesid = $symbolstothesid{$symbols};
1148 $thesaurustophindid{$thesid} = $phindid;
[2064]1149 $phindidtofrequency{$phindid} = $freq;
[1829]1150 }
1151 }
1152 close IN;
1153
1154 undef %symbolstothesid;
1155
1156 # 3.
[2064]1157 # Create phind-id numbers for remaining thesaurus entries,
1158 # and note that their frequency is 0 for later sorting.
[1829]1159 my $nextphindid = $phindid + 1;
1160
1161 open(TH, "<$thesaurusfile");
1162 while(<TH>) {
1163
1164 chomp;
1165 @fields = split(/:/, $_);
1166
1167 # read thesaurus-id and ensure it has a corresponding phind-id
1168 $thesid = shift @fields;
1169 if (!defined($thesaurustophindid{$thesid})) {
1170 $thesaurustophindid{$thesid} = $nextphindid;
[2064]1171 $phindidtofrequency{$nextphindid} = 0;
[1829]1172 $nextphindid++;
1173 }
1174 }
1175 close TH;
1176
1177 # 4.
1178 # Translate thesaurus file, replacing thesaurus-id numbers with
1179 # phind-id numbers.
1180 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
[2064]1181 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1182 my (@links, $linkid, %linkidtotype, $newrelation);
[1829]1183
1184 open(TH, "<$thesaurusfile");
1185 open(TO, ">$newthesaurusfile");
1186 while(<TH>) {
1187
1188 chomp;
1189 @fields = split(/:/, $_);
1190
1191 # phindid and symbols for this line
1192 ($thesid, $symbols, $relations) = @fields;
1193
1194 die unless ($thesid && $symbols);
1195 die unless $thesaurustophindid{$thesid};
1196 $phindid = $thesaurustophindid{$thesid};
1197
1198 # convert each part of the relation string to use phind-id numbers
[2064]1199 # at the same time, we want to sort the list by frequency.
1200 undef %linkidtotype;
1201
[1829]1202 foreach $linktext (split(/;/, $relations)) {
1203 @linkdata = split(/,/, $linktext);
[2064]1204
[1829]1205 # remember the linktype (e.g. BT, NT)
1206 $linktype = shift @linkdata;
[2064]1207
1208 # store the type of each link
1209 foreach $thesid (@linkdata) {
1210 die unless (defined($thesaurustophindid{$thesid}));
1211 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1212 }
1213 }
[1829]1214
[2064]1215 # sort the list of links, first by frequency, then by type.
1216 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1217 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1218 $linkcounter = (scalar @links);
1219
1220 # create a string describing the link information
1221 $linktype = $linkidtotype{$links[0]};
1222 $newrelation = $linktype;
1223 foreach $linkid (@links) {
1224 if ($linkidtotype{$linkid} ne $linktype) {
1225 $linktype = $linkidtotype{$linkid};
1226 $newrelation .= ";" . $linktype;
[1829]1227 }
[2064]1228 $newrelation .= "," . $linkid;
[1829]1229 }
[2064]1230 $newrelation .= ";";
1231
[1829]1232
[2064]1233 # output the new line
1234 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
[1829]1235 }
1236 close TH;
1237 close TO;
1238
1239 undef %thesaurustophindid;
[2064]1240 undef %linkidtotype;
1241 undef %phindidtofrequency;
[1829]1242
1243 # 5.
1244 # Read thesaurus data (in phind-id format) into memory
1245 my %thesaurusdata;
1246
1247 open(TH, "<$newthesaurusfile");
1248 while(<TH>) {
1249 chomp;
1250 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1251 die unless ($phindid && $symbols);
1252 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1253 }
[2487]1254 close TH;
[1829]1255
1256 # 6.
1257 # Add thesaurus data to phrases file
1258 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1259 my (@documents, @newexp, $k, $n);
1260 my $linenumber = 0;
1261
1262 open(IN, "<$infile");
1263 open(OUT, ">$outfile");
1264
1265 # Update existing phrases
1266 while(<IN>) {
1267
1268 chomp;
1269 @fields = split(/:/, $_);
1270
1271 # get data for this line
1272 $phindid = shift @fields;
1273
1274 # output the phrase data, with thesaurus information
1275 print OUT "$phindid:", join(":", @fields);
1276
1277 # add thesaurus data
1278 if (defined($thesaurusdata{$phindid})) {
1279 @fields = split(/:/, $thesaurusdata{$phindid});
1280 shift @fields;
1281 $linkcounter = shift @fields;
1282 $relations = shift @fields;
1283
1284 print OUT ":$linkcounter:$relations";
1285 $thesaurusdata{$phindid} = "";
1286 }
1287 print OUT "\n";
1288 }
1289 close IN;
1290
1291 # Add phrases that aren't already in the file
1292 foreach $phindid (sort numerically keys %thesaurusdata) {
1293 next unless ($thesaurusdata{$phindid});
1294
1295 @fields = split(/:/, $thesaurusdata{$phindid});
1296 $symbols = shift @fields;
1297 $linkcounter = shift @fields;
1298 $relations = shift @fields;
1299
1300 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1301 }
1302 close OUT;
1303
1304}
1305
1306# restore_vocabulary_data
1307#
1308# Read phrases.3 and restore vocabulary information. Then write
1309# this data to the MGPP input files (pword.txt and pdata.txt) and
1310# (if requested) to the saved phrases file.
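#
# For reference (taken from the print statements below): each phrase becomes a
# "<Document>key:text:..." record in pdata.txt and a "<Document>word" line in
# pword.txt (the word is left empty for multi-word phrases); if -savephrases
# was given, a tab-separated "frequency, document-count, is-thesaurus, text"
# line is also written to the saved phrases file.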
1311
1312sub restore_vocabulary_data {
1313 my ($self) = @_;
1314
[1839]1315 my $out = $self->{'outhandle'};
[1829]1316 my $verbosity = $self->{'verbosity'};
1317 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1318
1319 my $phinddir = $self->{'phinddir'};
1320 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1321 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1322 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1323 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1324
1325 my $savephrases = $self->{'savephrases'};
1326
1327 # 1.
1328 # Read the vocabulary file
1329 open(V, "<$vocabfile")
1330 || die "Cannot open $vocabfile: $!";
1331 my @symbol;
1332 my $i = 1;
1333 while(<V>) {
1334 chomp;
1335 $symbol[$i++] = $_;
1336 }
[2487]1337 close V;
[1829]1338
1339 # 2.
1340 # Translate phrases.3 to MGPP input files
[2079]1341 my ($key, $text, $word, $isThesaurus);
[1829]1342 my @fields;
1343 my $linenumber = 0;
1344
1345 open(IN, "<$infile");
1346 open(DATA, ">$datafile");
1347 open(WORD, ">$wordfile");
1348
1349 # Save the phrases in a separate text file
1350 if ($savephrases) {
1351 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1352 open(SAVE, ">$savephrases");
1353 }
1354
1355 while(<IN>) {
1356
1357 # read the line
1358 chomp;
1359 $line = $_;
1360 @fields = split(/:/, $line);
1361
1362 # get a phrase number for this line
1363 $key = shift @fields;
1364
1365 # restore the text of the phrase
1366 $text = shift @fields;
1367 $text =~ s/s(\d+)/$symbol[$1]/g;
1368 if ($text =~ / /) {
1369 $word = "";
1370 } elsif ($text ne 'untranslated') {
1371 $word = $text;
1372 }
1373
1374 # output the phrase data
[1643]1375 print DATA "<Document>";
[1829]1376 print DATA "$key:$text:", join(":", @fields), ":\n";
[1643]1377
1378 # output the word index search data
[1829]1379 print WORD "<Document>$word\n";
[1643]1380
[1808]1381 # output the phrases to a text file
1382 if ($savephrases) {
[2079]1383 if ((scalar @fields) == 7) {
1384 $isThesaurus = 1;
1385 } else {
1386 $isThesaurus = 0;
1387 }
1388 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
[1808]1389 }
[1643]1390 }
[1829]1391 close IN;
1392 close WORD;
1393 close DATA;
1394 close SAVE if ($savephrases);
[1808]1395
[1643]1396}
1397
[1829]1398
1399
[1643]1400# sort routines used to renumber phrases
1401
1402sub numerically { $a <=> $b }
1403
[1829]1404sub by_doc_frequency {
[1643]1405 my $fa = 1;
1406 if ($a =~ /,/) {
1407 $fa = $a;
1408 $fa =~ s/\d+,//;
1409 }
1410 my $fb = 1;
1411 if ($b =~ /,/) {
1412 $fb = $b;
1413 $fb =~ s/\d+,//;
1414 }
1415
1416 return ($fb <=> $fa);
1417}
1418
14191;