Context Navigation

source: main/trunk/greenstone2/perllib/classify/Phind.pm@ 23116

Last change on this file since 23116 was 23116, checked in by kjdon, 14 years ago
for incremental build, classifiers are not really done incrementally. Previously, we reconstructed all the docs from the database, and classified them, then processed any new/edited/deleted docs, updating the classifier as necessary. Now, we process all new/updated docs, then reconstruct the docs from the database, but only classify those not changed/deleted. This means that we are only ever adding docs to a classifier, never updating or deleting. I have removed edit_mode and all code handling deleting stuff from the classifier.
Property svn:keywords set to `Author Date Id Revision`
File size: 40.9 KB

Rev	Line
[3472]	1	###########################################################################
	2	#
	3	# Phind.pm -- the Phind classifier
	4	#
	5	# Copyright (C) 2000 Gordon W. Paynter
	6	# Copyright (C) 2000 New Zealand Digital Library Project
	7	#
	8	#
	9	# A component of the Greenstone digital library software
	10	# from the New Zealand Digital Library Project at the
	11	# University of Waikato, New Zealand.
	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# The Phind clasifier plugin.
	30	# Type "classinfo.pl Phind" at the command line for a summary.
	31
	32	package Phind;
	33
[17209]	34	use BaseClassifier;
[3472]	35	use util;
	36	use ghtml;
	37	use unicode;
	38
[10253]	39	use strict;
	40	no strict 'refs'; # allow filehandles to be variables and viceversa
	41
[3472]	42	my @removedirs = ();
	43
	44	my %wanted_index_files = ('td'=>1,
	45	't'=>1,
	46	'ti'=>1,
	47	'tl'=>1,
	48	'tsd'=>1,
	49	'idb'=>1,
	50	'ib1'=>1,
	51	'ib2'=>1,
	52	'ib3'=>1,
	53	'i'=>1,
	54	'il'=>1,
	55	'w'=>1,
	56	'wa'=>1);
	57
	58	sub BEGIN {
[17209]	59	@Phind::ISA = ('BaseClassifier');
[3472]	60	}
	61
	62	sub END {
	63
	64	# Tidy up stray files - we do this here as there's some weird problem
	65	# preventing us from doing it in the get_classify_info() function (on
	66	# windows at least) where the close() appears to fail on txthandle and
	67	# dochandle, thus preventing us from deleting those files
	68
	69	foreach my $dir (@removedirs) {
	70	if (-d $dir && opendir (DIR, $dir)) {
	71	my @files = readdir DIR;
	72	closedir DIR;
	73
[10253]	74	foreach my $file (@files) {
[3472]	75	next if $file =~ /^\.\.?$/;
	76	my ($suffix) = $file =~ /\.([^\.]+)$/;
	77	if (!defined $suffix \|\| !defined $wanted_index_files{$suffix}) {
	78	# delete it!
	79	&util::rm (&util::filename_cat ($dir, $file));
	80	}
	81	}
	82	}
	83	}
	84	}
	85
[4759]	86	my $arguments =
	87	[ { 'name' => "text",
[4873]	88	'desc' => "{Phind.text}",
[3540]	89	'type' => "string",
[4759]	90	'deft' => "section:Title,section:text",
	91	'reqd' => "no" },
	92	{ 'name' => "title",
[4873]	93	'desc' => "{Phind.title}",
[3540]	94	'type' => "metadata",
[4759]	95	'deft' => "Title",
	96	'reqd' => "no" },
[6968]	97	{ 'name' => "buttonname",
	98	'desc' => "{BasClas.buttonname}",
[3540]	99	'type' => "string",
[4759]	100	'deft' => "Phrase",
	101	'reqd' => "no" },
	102	{ 'name' => "language",
[4873]	103	'desc' => "{Phind.language}",
[6989]	104	'type' => "string",
[4759]	105	'deft' => "en",
	106	'reqd' => "no" },
	107	{ 'name' => "savephrases",
[4873]	108	'desc' => "{Phind.savephrases}",
[3540]	109	'type' => "string",
[4759]	110	'deft' => "",
	111	'reqd' => "no" },
	112	{ 'name' => "suffixmode",
[4873]	113	'desc' => "{Phind.suffixmode}",
[3540]	114	'type' => "int",
[4759]	115	'deft' => "1",
[8362]	116	'range' => "0,1",
[4759]	117	'reqd' => "no" },
[8362]	118	{ 'name' => "min_occurs",
	119	'desc' => "{Phind.min_occurs}",
	120	'type' => "int",
	121	'deft' => "2",
	122	'range' => "1,",
	123	'reqd' => "no" },
[4759]	124	{ 'name' => "thesaurus",
[4873]	125	'desc' => "{Phind.thesaurus}",
[3540]	126	'type' => "string",
[4759]	127	'deft' => "",
	128	'reqd' => "no" },
	129	{ 'name' => "untidy",
[4873]	130	'desc' => "{Phind.untidy}",
[3540]	131	'type' => "flag",
[4759]	132	'reqd' => "no" } ];
[3540]	133
[4759]	134	my $options = { 'name' => "Phind",
[5645]	135	'desc' => "{Phind.desc}",
[6408]	136	'abstract' => "no",
	137	'inherits' => "yes",
[4759]	138	'args' => $arguments };
[3540]	139
[3472]	140
	141	# Phrase delimiter symbols - these should be abstracted out someplace
	142
	143	my $colstart = "COLLECTIONSTART";
	144	my $colend = "COLLECTIONEND";
	145	my $doclimit = "DOCUMENTLIMIT";
	146	my $senlimit = "SENTENCELIMIT";
	147	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
	148
	149
	150	# Create a new Phind browser based on collect.cfg
	151
	152	sub new {
[10218]	153	my ($class) = shift (@_);
	154	my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
	155	push(@$classifierslist, $class);
[3472]	156
[17209]	157	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	158	push(@{$hashArgOptLists->{"OptList"}},$options);
[3540]	159
[17209]	160	my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
[6968]	161
[10253]	162	if ($self->{'info_only'}) {
	163	# don't worry about any options etc
	164	return bless $self, $class;
	165	}
	166
[3472]	167	# Ensure the Phind generate scripts are in place
	168	my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
	169	$file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
	170	my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
	171	if (!(-e $file1)) {
	172	print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
	173	exit(1);
	174	}
	175
[20454]	176	# things that may have ex. in them that need to be stripped off
	177	$self->{'text'} = $self->strip_ex_from_metadata($self->{'text'});
	178	$self->{'title'} = $self->strip_ex_from_metadata($self->{'title'});
	179
[10218]	180	# Transfer value from Auto Parsing to the variable name that used in previous GreenStone.
[20454]	181
[10218]	182	$self->{"indexes"} = $self->{"text"};
[3472]	183
[10218]	184	# Further setup
	185	$self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
	186	$self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
[6982]	187	if (! defined $self->{'builddir'}) {
	188	$self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
	189	}
[10218]	190	$self->{'total'} = 0;
	191
[22382]	192	# we set phind to be rtl if language is arabic
	193	if ($self->{'language'} eq "ar") {
	194	$self->{'textorientation'} = "rtl";
	195	}
[10218]	196	# Clean out the unused keys
	197	delete $self->{"text"};
[6982]	198
[3472]	199	return bless $self, $class;
	200	}
	201
	202
	203	# Initialise the Phind classifier
	204
	205	sub init {
	206	my $self = shift (@_);
	207
	208	# ensure we have a build directory
	209	my $builddir = $self->{'builddir'};
	210	die unless (-e "$builddir");
	211
	212	# create Phind directory
	213	my $phnumber = 1;
	214	my $phinddir = &util::filename_cat($builddir, "phind1");
	215	while (-e "$phinddir") {
	216	$phnumber++;
	217	$phinddir = &util::filename_cat($builddir, "phind$phnumber");
	218	}
	219	&util::mk_dir("$phinddir");
	220	$self->{'phinddir'} = $phinddir;
	221	$self->{'phindnumber'} = $phnumber;
	222
	223	push(@removedirs, $phinddir) unless $self->{'untidy'};
	224
	225	# open filehandles for documents and text
	226	my $clausefile = &util::filename_cat("$phinddir", "clauses");
	227	&util::rm($clausefile) if (-e $clausefile);
	228
	229	my $txthandle = 'TEXT' . $phnumber;
	230	open($txthandle, ">$clausefile") \|\| die "Cannot open $clausefile: $!";
	231	$self->{'txthandle'} = $txthandle;
	232
	233	my $docfile = &util::filename_cat("$phinddir", "docs.txt");
	234	&util::rm($docfile) if (-e $docfile);
	235
	236	my $dochandle = 'DOC' . $phnumber;
	237	open($dochandle, ">$docfile") \|\| die "Cannot open $docfile: $!";
	238	$self->{'dochandle'} = $dochandle;
	239
	240	}
	241
	242
	243	# Classify each document.
	244	#
	245	# Each document is passed here in turn. The classifier extracts the
	246	# text of each and stores it in the clauses file. Document details are
	247	# stored in the docs.txt file.
	248
	249	sub classify {
	250	my $self = shift (@_);
[23116]	251	my ($doc_obj) = @_;
[3472]	252
	253	my $verbosity = $self->{'verbosity'};
	254	my $top_section = $doc_obj->get_top_section();
	255
[10218]	256	my $titlefield = $self->{'title'};
[3472]	257
	258	my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
[3536]	259	if (!defined($title)) {
	260	$title = "";
	261	print STDERR "Phind: document has no title\n";
	262	}
[3472]	263	print "process: $title\n" if ($verbosity > 2);
	264
	265	# Only consider the file if it is in the correct language
	266	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
[10218]	267	my $phrlanguage = $self->{'language'};
[3472]	268	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
[18455]	269
[3472]	270	# record this file
	271	$self->{'total'} ++;
[10253]	272	# what is $file ???
	273	# print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
[3472]	274
	275	# Store document details
	276	my $OID = $doc_obj->get_OID();
	277	$OID = "NULL" unless defined $OID;
	278	my $dochandle = $self->{'dochandle'};
	279	print $dochandle "<Document>\t$OID\t$title\n";
	280
	281	# Store the text occuring in this object
	282
	283	# output the document delimiter
	284	my $txthandle = $self->{'txthandle'};
	285	print $txthandle "$doclimit\n";
	286
[8362]	287	# iterate over the required indexes and store their text
[3472]	288	my $indexes = $self->{'indexes'};
	289	my $text = "";
	290	my ($part, $level, $field, $section, $data, $dataref);
	291
	292	foreach $part (split(/,/, $indexes)) {
	293
	294	# Each field has a level and a data element ((e.g. document:Title)
	295	($level, $field) = split(/:/, $part);
	296	die unless ($level && $field);
	297
	298	# Extract the text from every section
[3540]	299	# (In phind, document:text and section:text are equivalent)
[3472]	300	if ($field eq "text") {
	301	$data = "";
	302	$section = $doc_obj->get_top_section();
	303	while (defined($section)) {
	304	$data .= $doc_obj->get_text($section) . "\n";
	305	$section = $doc_obj->get_next_section($section);
	306	}
	307	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	308	}
	309
	310	# Extract a metadata field from a document
	311	# (If there is more than one element of the given type, get them all.)
	312	elsif ($level eq "document") {
	313	$dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
	314	foreach $data (@$dataref) {
	315	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	316	}
	317	}
	318
	319	# Extract metadata from every section in a document
	320	elsif ($level eq "section") {
	321	$data = "";
	322	$section = $doc_obj->get_top_section();
	323	while (defined($section)) {
	324	$dataref = $doc_obj->get_metadata($section, $field);
	325	$data .= join("\n", @$dataref) . "\n";
	326	$section = $doc_obj->get_next_section($section);
	327	}
	328	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	329	}
	330
	331	# Some sort of specification which I don't understand
	332	else {
	333	die "Unknown level ($level) in Phind index ($part)\n";
	334	}
	335
	336	}
	337
	338	# output the text
	339	$text =~ tr/\n//s;
	340	print $txthandle "$text";
	341	}
	342
	343
	344	# Construct the classifier from the information already gathered
	345	#
	346	# When get_classify_info is called, the clauses and docs.txt files have
[3540]	347	# already been constructed in the Phind directory. This function will
[3472]	348	# translate them into compressed, indexed MGPP files that can be read by
	349	# the phindcgi script. It will also register our classifier so that it
	350	# shows up in the navigation bar.
	351
	352	sub get_classify_info {
	353	my $self = shift (@_);
[6408]	354	my ($gli) = @_;
[3472]	355
	356	close $self->{'dochandle'};
	357	close $self->{'txthandle'};
	358	my $verbosity = $self->{'verbosity'};
	359	my $out = $self->{'outhandle'};
	360	my $phinddir = $self->{'phinddir'};
	361
	362	my $osextra = "";
	363	if ($ENV{'GSDLOS'} !~ /^windows$/i) {
	364	$osextra = " -d /";
	365	}
	366
[6408]	367	print STDERR "</Stage>\n" if $gli;
	368
[3472]	369	if ($verbosity) {
	370	print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
	371	print $out "*** in ", $self->{'phinddir'}, "\n";
	372	}
	373
[6408]	374	print STDERR "<Stage name='Phind'>\n" if $gli;
	375
[3540]	376	# Construct phind indexes
[3472]	377	my $suffixmode = $self->{'suffixmode'};
[8362]	378	my $min_occurs = $self->{'min_occurs'};
[3472]	379	my ($command, $status);
	380
	381	# Generate the vocabulary, symbol statistics, and numbers file
	382	# from the clauses file
	383	print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
[6408]	384	print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
[3472]	385	&extract_vocabulary($self);
[3540]	386
[3472]	387	# Use the suffix program to generate the phind/phrases file
	388	print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
[6408]	389	print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
[8362]	390	&execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
[3472]	391
	392	# check that we generated some files. It's not necessarily an error if
	393	# we didn't (execute() would have quit on error), but we can't go on.
	394	my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
	395	if (! -r $phrasesfile) {
[6408]	396	print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
[3472]	397	print $out "\nNo phrases found for Phind classifier!\n";
	398	return;
[3540]	399	}
[3472]	400
	401	# Create the phrase file and put phrase numbers in phind/phrases
	402	print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
[6408]	403	print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
[3472]	404	&renumber_phrases($self);
	405
	406	print $out "\nCreating phrase databases\n";
[6408]	407	print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
[3472]	408	my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
	409	my $mg_stem = &util::filename_cat($phinddir, "pdata");
	410
	411	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
	412	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
	413	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
	414
	415	# create the mg index of words
	416	print $out "\nCreating word-level search indexes\n";
[6408]	417	print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
[3472]	418	$mg_input = &util::filename_cat($phinddir, "pword.txt");
	419	$mg_stem = &util::filename_cat($phinddir, "pword");
	420
	421	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
	422	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
	423	&execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
	424	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
	425	&execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
	426	&execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
	427
	428	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
	429	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
	430	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
	431
	432	# create the mg document information database
	433	print $out "\nCreating document information databases\n";
[6408]	434	print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
[3472]	435	$mg_input = &util::filename_cat($phinddir, "docs.txt");
	436	$mg_stem = &util::filename_cat($phinddir, "docs");
	437
	438	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
	439	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
	440	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
	441
[22382]	442	my $parameters = "phindnumber=$self->{'phindnumber'}";
	443	if (defined ($self->{'textorientation'})) {
	444	$parameters .= ";textorientation=$self->{'textorientation'}";
	445	}
[3472]	446	# Return the information about the classifier that we'll later want to
	447	# use to create macros when the Phind classifier document is displayed.
	448	my %classifyinfo = ('thistype'=>'Invisible',
	449	'childtype'=>'Phind',
	450	'Title'=>$self->{'buttonname'},
[22382]	451	'parameters'=>$parameters,
[3472]	452	'contains'=>[]);
	453
	454	my $collection = $self->{'collection'};
	455	my $url = "library?a=p&p=phind&c=$collection";
	456	push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
	457
	458	return \%classifyinfo;
	459	}
	460
	461
	462
	463	sub convert_gml_to_tokens {
	464
	465	my ($language_exp, $text) = @_;
	466
	467	# escape any magic words... - jrm21
	468	foreach my $delim (@delimiters) {
	469	my $replacement=lc($delim);
	470	my $num= $text=~ s/$delim/$replacement/g;
	471	if (!$num) {$num=0;}
	472	}
	473
[21876]	474	if ($language_exp =~ /^en$/) {
[3472]	475	return &convert_gml_to_tokens_EN($text);
	476	}
[3540]	477
[3472]	478	if ($language_exp =~ /zh/) {
	479	return &convert_gml_to_tokens_ZH($text);
[3540]	480	}
[3472]	481
	482	$_ = $text;
	483
	484	# 1. remove GML tags
	485
	486	# Remove everything that is in a tag
	487	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
	488	s/\s<br>\s/ LINEBREAK /isgo;
	489	s/<[^>]*>/ /sgo;
	490
	491	# Now we have the text, but it may contain HTML
	492	# elements coded as > etc. Remove these tags.
	493	s/&/&/sgo;
	494	s/</</sgo;
	495	s/>/>/sgo;
	496	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
	497	s/\s<br>\s/ LINEBREAK /isgo;
	498	s/<[^>]*>/ /sgo;
	499
	500	# replace<p> and <br> placeholders with clause break symbol (\n)
	501	s/\s+/ /gso;
	502	s/PARAGRAPHBREAK/\n/sgo;
	503	s/LINEBREAK/\n/sgo;
	504
	505
	506	# 2. Split the remaining text into space-delimited tokens
	507
[3540]	508	# Convert any HTML special characters (like ") to their UTF8 equivalent
	509	s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
[3472]	510
	511	# Split text at word boundaries
	512	s/\b/ /go;
[21876]	513
[3472]	514	# 3. Convert the remaining text to "clause format"
	515
	516	# Insert newline if the end of a sentence is detected
	517	# (delimter is: "[\.\?\!]\s")
	518	# s/\s*[\.\?\!]\s+/\n/go;
	519
	520	# remove unnecessary punctuation and replace with clause break symbol (\n)
[21876]	521	# the following very nicely removes all non alphanumeric characters. too bad if you are not using english...
	522	#s/[^\w ]/\n/go;
	523	# replace punct with new lines - is this what we want??
	524	s/\s[\?\;\:\!\,\.\"\[\]\{\}\(\)]\s/\n/go; #"
	525	# then remove other punct with space
	526	s/[\'\`\\\_]/ /go;
[3472]	527
	528	# remove extraneous whitespace
	529	s/ +/ /sgo;
	530	s/^\s+//mgo;
	531	s/\s*$/\n/mgo;
	532
	533	# remove lines that contain one word or less
	534	s/^\S*$//mgo;
	535	s/^\s*$//mgo;
	536	tr/\n//s;
	537
	538	return $_;
	539	}
	540
	541	# a chinese version
	542	sub convert_gml_to_tokens_ZH {
	543
	544	$_ = shift @_;
	545
	546	# Replace all whitespace with a simple space
	547	s/\s+/ /gs;
	548	# Remove everything that is in a tag
	549	s/\s<p>\s/ PARAGRAPHBREAK /isg;
	550	s/\s<br>\s/ LINEBREAK /isg;
	551	s/<[^>]*>/ /sg;
	552
	553	# Now we have the text, but it may contain HTML
	554	# elements coded as > etc. Remove these tags.
	555	s/</</sg;
	556	s/>/>/sg;
	557
	558	s/\s+/ /sg;
	559	s/\s<p>\s/ PARAGRAPHBREAK /isg;
	560	s/\s<br>\s/ LINEBREAK /isg;
	561	s/<[^>]*>/ /sg;
	562
	563	# remove & and other miscellaneous markup tags
	564	s/&/&/sg;
	565	s/</</sg;
	566	s/>/>/sg;
	567	s/&/&/sg;
	568
	569	# replace<p> and <br> placeholders with carriage returns
	570	s/PARAGRAPHBREAK/\n/sg;
	571	s/LINEBREAK/\n/sg;
	572
	573
	574	# print STDERR "text:$_\n";
	575	return $_;
	576	}
[3540]	577
[3472]	578	# A version of convert_gml_to_tokens that is fine-tuned to the English language.
	579
	580	sub convert_gml_to_tokens_EN {
	581	$_ = shift @_;
	582
	583	# FIRST, remove GML tags
	584
	585	# Replace all whitespace with a simple space
	586	s/\s+/ /gs;
	587
	588	# Remove everything that is in a tag
	589	s/\s<p>\s/ PARAGRAPHBREAK /isg;
	590	s/\s<br>\s/ LINEBREAK /isg;
	591	s/<[^>]*>/ /sg;
	592
	593	# Now we have the text, but it may contain HTML
	594	# elements coded as > etc. Remove these tags.
	595	s/</</sg;
	596	s/>/>/sg;
	597
	598	s/\s+/ /sg;
	599	s/\s<p>\s/ PARAGRAPHBREAK /isg;
	600	s/\s<br>\s/ LINEBREAK /isg;
	601	s/<[^>]*>/ /sg;
	602
	603	# remove & and other miscellaneous markup tags
	604	s/&/&/sg;
	605	s/</</sg;
	606	s/>/>/sg;
	607	s/&/&/sg;
	608
	609	# replace<p> and <br> placeholders with carriage returns
	610	s/PARAGRAPHBREAK/\n/sg;
	611	s/LINEBREAK/\n/sg;
	612
	613
	614	# Exceptional punctuation
	615	#
	616	# We make special cases of some punctuation
	617
	618	# remove any apostrophe that indicates omitted letters
	619	s/(\w+)\'(\w*\s)/ $1$2 /g;
	620
	621	# remove period that appears in a person's initals
	622	s/\s([A-Z])\./ $1 /g;
	623
	624	# replace hyphens in hypheanted words and names with a space
	625	s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
	626
	627	# Convert the remaining text to "clause format",
	628	# This means removing all excess punctuation and garbage text,
	629	# normalising valid punctuation to fullstops and commas,
	630	# then putting one cluse on each line.
	631
	632	# Insert newline when the end of a sentence is detected
	633	# (delimter is: "[\.\?\!]\s")
	634	s/\s*[\.\?\!]\s+/\n/g;
	635
	636	# split numbers after four digits
	637	s/(\d\d\d\d)/$1 /g;
	638
	639	# split words after 32 characters
	640
	641	# squash repeated punctuation
	642	tr/A-Za-z0-9 //cs;
	643
	644	# save email addresses
	645	# s/\w+@\w+\.[\w\.]+/EMAIL/g;
	646
	647	# normalise clause breaks (mostly punctuation symbols) to commas
	648	s/[^A-Za-z0-9 \n]+/ , /g;
	649
	650	# Remove repeated commas, and replace with newline
	651	s/\s*,[, ]+/\n/g;
	652
	653	# remove extra whitespace
	654	s/ +/ /sg;
	655	s/^\s+//mg;
	656	s/\s*$/\n/mg;
	657
	658	# remove lines that contain one word or less
	659	s/^\w*$//mg;
	660	s/^\s*$//mg;
	661	tr/\n//s;
	662
	663	return $_;
	664
	665	}
	666
	667
	668
	669	# Execute a system command
	670
	671	sub execute {
	672	my ($command, $verbosity, $outhandle) = @_;
	673	print $outhandle "Executing: $command\n" if ($verbosity > 2);
	674	$! = 0;
	675	my $status = system($command);
	676	if ($status != 0) {
	677	print STDERR "Phind - Error executing '$command': $!\n";
[3540]	678	exit($status); # this causes the build to fail...
[3472]	679	}
	680	}
	681
	682
	683	# Generate the vocabulary, symbol statistics, and numbers file from the
	684	# clauses file. This is legacy code, so is a bit messy and probably wont
	685	# run under windows.
	686
	687	sub extract_vocabulary {
	688	my ($self) = @_;
	689
	690	my $verbosity = $self->{'verbosity'};
	691	my $out = $self->{'outhandle'};
	692
	693	my $collectiondir = $self->{'collectiondir'};
	694	my $phinddir = $self->{'phinddir'};
	695
[10218]	696	my $language_exp = $self->{'language'};
[3472]	697
	698	my ($w, $l, $line, $word);
	699
	700	my ($first_delimiter, $last_delimiter,
	701	$first_stopword, $last_stopword,
	702	$first_extractword, $last_extractword,
	703	$first_contentword, $last_contentword,
	704	$phrasedelimiter);
	705
	706	my $thesaurus = $self->{'thesaurus'};
	707	my ($thesaurus_links, $thesaurus_terms,
	708	%thesaurus, $first_thesaurusword, $last_thesaurusword);
	709
	710	my %symbol;
	711	my (%freq);
	712
	713	print $out "Calculating vocabulary\n" if ($verbosity > 1);
	714
	715	# Read and store the stopwords
	716	my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
	717	my $stopword_files = ();
	718	my ($language, $language_dir, $file, $file_name);
	719	my %stopwords;
	720
	721	# Examine each directory in the stopword directory
	722	opendir(STOPDIR, $stopdir);
	723	foreach $language (readdir STOPDIR) {
	724
	725	# Ignore entries that do not match the classifier's language
	726	next unless ($language =~ /$language_exp/);
	727	$language_dir = &util::filename_cat($stopdir, $language);
	728	next unless (-d "$language_dir");
	729
	730	opendir(LANGDIR, $language_dir);
	731	foreach $file (readdir LANGDIR) {
	732
	733	# Ignore entries that are not stopword files
	734	next unless ($file =~ /sw$/);
	735	$file_name = &util::filename_cat($language_dir, $file);
	736	next unless (-f "$file_name");
	737
	738	# Read the stopwords
	739	open(STOPFILE, "<$file_name");
	740	while (<STOPFILE>) {
	741	s/^\s+//;
	742	s/\s.*//;
	743	$word = $_;
	744	$l = lc($word);
	745	$stopwords{$l} = $word;
	746	}
	747	close STOPFILE;
	748
	749	}
	750	closedir LANGDIR;
	751	}
	752	closedir STOPDIR;
	753
	754	# Read thesaurus information
	755	if ($thesaurus) {
	756
	757	# link file exists
	758	$thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
	759	die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
	760
	761	# ensure term file exists in the correct language
	762	if ($language_exp =~ /^([a-z][a-z])/) {
	763	$language = $1;
	764	} else {
	765	$language = 'en';
	766	}
	767	$thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
	768	die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
	769
	770
	771	# Read the thesaurus terms
	772	open(TH, "<$thesaurus_terms");
	773	while(<TH>) {
	774	s/^\d+ //;
	775	s/\(.*\)//;
	776	foreach $w (split(/\s+/, $_)) {
	777	$thesaurus{lc($w)} = $w;
	778	}
	779	}
	780	close TH;
	781	}
	782
	783	# Read words in the text and count occurences
	784	open(TXT, "<$phinddir/clauses");
	785
	786	my @words;
	787	while(<TXT>) {
	788	$line = $_;
	789	next unless ($line =~ /./);
	790
	791	@words = split(/\s+/, $line);
	792	foreach $w (@words) {
	793	$l = lc($w);
	794	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
	795	$freq{$w}++;
	796	}
	797	$freq{$senlimit}++;
	798	}
	799
	800	close TXT;
	801
	802	# Calculate the "best" form of each word
	803	my (%bestform, %totalfreq, %bestfreq);
	804
	805	foreach $w (sort (keys %freq)) {
	806	$l = lc($w);
	807
	808	# totalfreq is the number of times a term appears in any form
	809	$totalfreq{$l} += $freq{$w};
	810
	811	if (defined $stopwords{$l}) {
	812	$bestform{$l} = $stopwords{$l};
	813
	814	} elsif (defined $thesaurus{$l}) {
	815	$bestform{$l} = $thesaurus{$l};
	816
	817	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
	818	$bestfreq{$l} = $freq{$w};
	819	$bestform{$l} = $w;
	820	}
	821	}
	822	undef %freq;
	823	undef %bestfreq;
	824
	825
	826	# Assign symbol numbers to tokens
	827	my $nextsymbol = 1;
	828	my (@vocab);
	829
	830	# Delimiters
	831	$first_delimiter = 1;
	832
	833	foreach $word (@delimiters) {
	834
	835	# $word = lc($word); # jrm21
	836	$word = uc($word);
	837	$bestform{$word} = $word;
	838	$vocab[$nextsymbol] = $word;
	839	$symbol{$word} = $nextsymbol;
	840	$nextsymbol++;
	841	}
	842	$last_delimiter = $nextsymbol - 1;
	843	# Stopwords
	844	$first_stopword = $nextsymbol;
	845
	846	foreach my $word (sort keys %stopwords) {
	847	# don't include stopword unless it occurs in the text
	848	$word = lc($word);
	849	next unless ($totalfreq{$word});
	850	next if ($symbol{$word});
	851
	852	$vocab[$nextsymbol] = $word;
	853	$symbol{$word} = $nextsymbol;
	854	$nextsymbol++;
	855	}
	856	$last_stopword = $nextsymbol - 1;
	857	$first_contentword = $nextsymbol;
	858
	859	# Thesaurus terms
	860	if ($thesaurus) {
	861	$first_thesaurusword = $nextsymbol;
	862
	863	foreach my $word (sort keys %thesaurus) {
	864
	865	$word = lc($word);
	866	next if ($symbol{$word});
	867	$bestform{$word} = $thesaurus{$word};
	868
	869	$vocab[$nextsymbol] = $word;
	870	$symbol{$word} = $nextsymbol;
	871	$nextsymbol++;
	872
	873	}
	874	$last_thesaurusword = $nextsymbol - 1;
	875	}
	876
	877	# Other content words
	878	$first_extractword = $nextsymbol;
	879
	880	foreach my $word (sort (keys %bestform)) {
	881
	882	next if ($symbol{$word});
	883
	884	$vocab[$nextsymbol] = $word;
	885	$symbol{$word} = $nextsymbol;
	886	$nextsymbol++;
	887	}
	888	$last_extractword = $nextsymbol - 1;
	889	$last_contentword = $nextsymbol - 1;
	890
	891	# Outut the words
	892	print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
	893	open(VOC, ">$phinddir/clauses.vocab");
	894
	895	for (my $i = 1; $i < $nextsymbol; $i++) {
	896	$w = $vocab[$i];
	897
	898	print VOC "$bestform{$w}\n";
	899	$totalfreq{$w} = 0 unless ($totalfreq{$w});
	900	}
	901	close VOC;
	902
	903
	904	# Create statistics file
	905	# Output statistics about the vocablary
	906	print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
	907	&util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
	908
	909	open(STAT, ">$phinddir/clauses.stats")
	910	\|\| die "Cannot open $phinddir/clauses.stats: $!";
	911
	912	print STAT "first_delimiter $first_delimiter\n";
	913	print STAT "last_delimiter $last_delimiter\n";
	914	print STAT "first_stopword $first_stopword\n";
	915	print STAT "last_stopword $last_stopword\n";
	916	if ($thesaurus) {
	917	print STAT "first_thesaurusword $first_thesaurusword\n";
	918	print STAT "last_thesaurusword $last_thesaurusword\n";
	919	}
	920	print STAT "first_extractword $first_extractword\n";
	921	print STAT "last_extractword $last_extractword\n";
	922	print STAT "first_contentword $first_contentword\n";
	923	print STAT "last_contentword $last_contentword\n";
	924	print STAT "first_symbol $first_delimiter\n";
	925	print STAT "last_symbol $last_contentword\n";
	926	print STAT "first_word $first_stopword\n";
	927	print STAT "last_word $last_contentword\n";
	928	close STAT;
	929
	930	undef @vocab;
	931
	932
	933	# Create numbers file
	934	# Save text as symbol numbers
	935	print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
	936
	937	open(TXT, "<$phinddir/clauses");
	938	open(NUM, ">$phinddir/clauses.numbers");
	939
	940	## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
	941	## print NUM "$symbol{lc($colstart)}\n"; # jrm21
	942	$phrasedelimiter = $symbol{$senlimit};
	943	print NUM "$symbol{$colstart}\n";
	944
	945	# set up the special symbols that delimit documents and sentences
	946	while(<TXT>) {
	947
	948	# split sentence into a list of tokens
	949	$line = $_;
	950	next unless ($line =~ /./);
	951	@words = split(/\s+/, $line);
	952
	953	# output one token at a time
	954	foreach $word (@words) {
	955	# don't lower-case special delimiters - jrm21
	956	if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
	957	$word = lc($word);
	958	}
	959	print NUM "$symbol{$word}\n";
	960	}
	961
	962	# output phrase delimiter
	963	print NUM "$phrasedelimiter\n";
	964	}
	965
	966	close TXT;
	967	# print NUM "$symbol{lc($colend)}\n";# jrm21
	968	print NUM "$symbol{$colend}\n";
	969	close NUM;
	970
	971	# Save thesaurus data in one convienient file
	972	if ($thesaurus) {
	973
	974	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
	975
	976
	977	print $out "Saving thesaurus as numbers in $thesaurusfile\n"
	978	if ($verbosity > 1);
	979
	980	# Read the thesaurus terms
	981	my ($num, $text, %thes_symbols);
	982
	983	open(TH, "<$thesaurus_terms");
	984	while(<TH>) {
	985	chomp;
	986	@words = split(/\s+/, $_);
	987	$num = shift @words;
	988	$text = "";
	989
	990	# translate words into symbol numbers
	991	foreach $word (@words) {
	992	$word = lc($word);
	993	if ($symbol{$word}) {
	994	$text .= "s$symbol{$word} ";
	995	} elsif ($verbosity) {
	996	print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
	997	}
	998	}
	999	$text =~ s/ $//;
	1000	$thes_symbols{$num} = $text;
	1001	}
	1002	close TH;
	1003
	1004	# Read the thesaurus links and write the corresponding data
	1005	open(TH, "<$thesaurus_links");
	1006	open(THOUT, ">$thesaurusfile");
	1007
	1008	while(<TH>) {
	1009	chomp;
	1010	($num, $text) = split(/:/, $_);
	1011
	1012	if (defined($thes_symbols{$num})) {
	1013	print THOUT "$num:$thes_symbols{$num}:$text\n";
	1014	} else {
	1015	print THOUT "$num:untranslated:$text\n";
	1016	}
	1017	}
	1018	close TH;
	1019	close THOUT;
	1020	}
	1021
	1022
	1023
	1024
	1025	}
	1026
	1027
	1028	# renumber_phrases
	1029	#
	1030	# Prepare the phrases file to be input to mgpp. The biggest problem is
	1031	# reconciling the phrase identifiers used by the suffix program (which
	1032	# we'll call suffix-id numbers) with the numbers used in the thesaurus
	1033	# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
	1034	# Phind-id numbers must be sorted by frequency of occurance.
	1035	#
	1036	# Start creating a set of phind-id numbers from the sorted suffix-id
	1037	# numbers and (if required) the thesaurus-id numbers. Then add any other
	1038	# phrases occuring in the thesaurus.
	1039	#
	1040	# The last thing we have to do is restore the vocabulary information to the
	1041	# phrase file so that the phrases are stored as words, not as symbol
	1042	# numbers.
	1043
	1044	# The original phrases file looks something like this:
	1045	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
	1046	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
	1047	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
	1048	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
	1049
	1050
	1051	sub renumber_phrases {
	1052	my ($self) = @_;
	1053
	1054	renumber_suffix_data($self);
	1055	renumber_thesaurus_data($self);
	1056	restore_vocabulary_data($self);
	1057
	1058	}
	1059
	1060
	1061
	1062	# renumber_suffix_data
	1063	#
	1064	# Translate phrases file to phrases.2 using phind keys instead
	1065	# of suffix keys and sorting the expansion data.
	1066
	1067	sub renumber_suffix_data {
	1068	my ($self) = @_;
	1069
	1070	my $verbosity = $self->{'verbosity'};
	1071	my $out = $self->{'outhandle'};
	1072	print $out "Translate phrases: suffix-ids become phind-id's\n"
	1073	if ($verbosity);
	1074
	1075	my $phinddir = $self->{'phinddir'};
	1076	my $infile = &util::filename_cat($phinddir, 'phrases');
	1077	my $outfile = &util::filename_cat($phinddir, 'phrases.2');
	1078
	1079	# Read the phrase file. Calculate initial set of phind-id
	1080	# numbers and store (suffixid -> frequency) relation.
	1081
	1082	my %suffixtophind;
[10253]	1083	my @totalfrequency;
[3472]	1084	my (@fields, $suffixid);
	1085	my $nextphind = 1;
	1086
	1087	open(IN, "<$infile");
	1088	while(<IN>) {
	1089
	1090	chomp;
	1091	@fields = split(/:/, $_);
	1092
	1093	# get next suffixid and phindid
	1094	$suffixid = shift @fields;
	1095	$suffixtophind{$suffixid} = $nextphind;
	1096
	1097	# store total frequency
	1098	shift @fields;
	1099	$totalfrequency[$nextphind] = shift @fields;
	1100
	1101	$nextphind++;
	1102	}
	1103	close IN;
	1104
	1105
	1106	# Translate phrases file to phrases.2. Use phind keys (not suffix
	1107	# keys), sort expansion and document occurance data in order of
	1108	# descending frequency..
	1109	open(IN, "<$infile");
	1110	open(OUT, ">$outfile");
	1111
	1112	my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
	1113	my (@documwents, @newexp, $k, $n);
	1114	my $linenumber = 0;
	1115
	1116	while(<IN>) {
	1117
	1118	# read the line
	1119	chomp;
	1120	@fields = split(/:/, $_);
	1121
	1122	# get a phrase number for this line
	1123	$suffixid = shift @fields;
	1124	die unless (defined($suffixtophind{$suffixid}));
	1125	$phindid = $suffixtophind{$suffixid};
	1126
	1127	# get the symbols in the phrase
	1128	$text = shift @fields;
	1129
	1130	# output status information
	1131	$linenumber++;
	1132	if ($verbosity > 2) {
	1133	if ($linenumber % 1000 == 0) {
	1134	print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
[10253]	1135	}
	1136	# what are $num and $key??
	1137	#print $out "$num: $key\t($text)\n" if ($verbosity > 3);
[3472]	1138	}
	1139
	1140	# get the phrase frequency
	1141	$tf = shift @fields;
	1142
	1143	# get the number of expansions
	1144	$countexp = shift @fields;
	1145
	1146	# get the expansions, convert them into phind-id numbers, and sort them
	1147	$expansions = shift @fields;
	1148	@newexp = ();
	1149	foreach $k (split(/,/, $expansions)) {
	1150	die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
	1151	$n = $suffixtophind{$k};
	1152	push @newexp, $n;
	1153	}
	1154	@newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
	1155
	1156	# get the number of documents
	1157	$countdocs = shift @fields;
	1158
	1159	# get the documents and sort them
	1160	$documents = shift @fields;
	1161	$documents =~ s/d//g;
[10253]	1162	my @documents = split(/;/, $documents);
[3472]	1163	@documents = sort by_doc_frequency @documents;
	1164
	1165	# output the phrase data
	1166	print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
	1167	print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
	1168
	1169	}
	1170
	1171	close IN;
	1172	close OUT;
	1173	}
	1174
	1175
	1176	# renumber_thesaurus_data
	1177	#
	1178	# Translate phrases.2 to phrases.3, adding thesaurus data if available.
	1179
	1180	sub renumber_thesaurus_data {
	1181	my ($self) = @_;
	1182
	1183	my $out = $self->{'outhandle'};
	1184	my $verbosity = $self->{'verbosity'};
	1185	my $thesaurus = $self->{'thesaurus'};
	1186
	1187	my $phinddir = $self->{'phinddir'};
	1188	my $infile = &util::filename_cat($phinddir, "phrases.2");
	1189	my $outfile = &util::filename_cat($phinddir, "phrases.3");
	1190
	1191
	1192	# If no thesaurus is defined, simply move the phrases file.
	1193	if (!$thesaurus) {
	1194	print $out "Translate phrases.2: no thesaurus data\n"
	1195	if ($verbosity);
	1196	&util::mv($infile, $outfile);
	1197	return;
	1198	}
	1199
	1200	print $out "Translate phrases.2: add thesaurus data\n"
	1201	if ($verbosity);
	1202
	1203	# 1.
	1204	# Read thesaurus file and store (symbols->thesaurusid) mapping
	1205	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
	1206	my %symbolstothesid;
	1207	my (@fields, $thesid, $symbols);
	1208
	1209	open(TH, "<$thesaurusfile");
	1210
	1211	while (<TH>) {
	1212
	1213	chomp;
	1214	@fields = split(/:/, $_);
	1215
	1216	# get id and text
	1217	$thesid = shift @fields;
	1218	$symbols = shift @fields;
	1219	$symbolstothesid{$symbols} = $thesid;
	1220	}
	1221	close TH;
	1222
	1223	# 2.
	1224	# Read phrases file to find thesaurus entries that already
	1225	# have a phindid. Store their phind-ids for later translation,
	1226	# and store their frequency for later sorting.
	1227	my %thesaurustophindid;
	1228	my %phindidtofrequency;
	1229	my ($phindid, $freq);
	1230
	1231	open(IN, "<$infile");
	1232
	1233	while(<IN>) {
	1234
	1235	chomp;
	1236	@fields = split(/:/, $_);
	1237
	1238	# phindid and symbols for this line
	1239	$phindid = shift @fields;
	1240	$symbols = shift @fields;
	1241	$freq = shift @fields;
	1242
	1243	# do we have a thesaurus id corresponding to this phrase?
	1244	if (defined($symbolstothesid{$symbols})) {
	1245	$thesid = $symbolstothesid{$symbols};
	1246	$thesaurustophindid{$thesid} = $phindid;
	1247	$phindidtofrequency{$phindid} = $freq;
	1248	}
	1249	}
	1250	close IN;
	1251
	1252	undef %symbolstothesid;
	1253
	1254	# 3.
	1255	# Create phind-id numbers for remaining thesaurus entries,
	1256	# and note that their frequency is 0 for later sorting.
	1257	my $nextphindid = $phindid + 1;
	1258
	1259	open(TH, "<$thesaurusfile");
	1260	while(<TH>) {
	1261
	1262	chomp;
	1263	@fields = split(/:/, $_);
	1264
	1265	# read thesaurus-id and ensure it has a corresponding phind-id
	1266	$thesid = shift @fields;
	1267	if (!defined($thesaurustophindid{$thesid})) {
	1268	$thesaurustophindid{$thesid} = $nextphindid;
	1269	$phindidtofrequency{$nextphindid} = 0;
	1270	$nextphindid++;
	1271	}
	1272	}
	1273	close TH;
	1274
	1275	# 4.
	1276	# Translate thesaurus file, replacing thesaurus-id numbers with
	1277	# phind-id numbers.
	1278	my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
	1279	my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
	1280	my (@links, $linkid, %linkidtotype, $newrelation);
	1281
	1282	open(TH, "<$thesaurusfile");
	1283	open(TO, ">$newthesaurusfile");
	1284	while(<TH>) {
	1285
	1286	chomp;
	1287	@fields = split(/:/, $_);
	1288
	1289	# phindid and symbols for this line
	1290	($thesid, $symbols, $relations) = @fields;
	1291
	1292	die unless ($thesid && $symbols);
	1293	die unless $thesaurustophindid{$thesid};
	1294	$phindid = $thesaurustophindid{$thesid};
	1295
	1296	# convert each part of the relation string to use phind-id numbers
	1297	# at the same time, we want to sort the list by frequency.
	1298	undef %linkidtotype;
	1299
	1300	foreach $linktext (split(/;/, $relations)) {
	1301	@linkdata = split(/,/, $linktext);
	1302
	1303	# remember the linktype (e.g. BT, NT)
	1304	$linktype = shift @linkdata;
	1305
	1306	# store the type of each link
	1307	foreach $thesid (@linkdata) {
	1308	die unless (defined($thesaurustophindid{$thesid}));
	1309	$linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
	1310	}
	1311	}
	1312
	1313	# sort the list of links, first by frequency, then by type.
	1314	@links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
	1315	or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
	1316	$linkcounter = (scalar @links);
	1317
	1318	# create a string describing the link information
	1319	$linktype = $linkidtotype{$links[0]};
	1320	$newrelation = $linktype;
	1321	foreach $linkid (@links) {
	1322	if ($linkidtotype{$linkid} ne $linktype) {
	1323	$linktype = $linkidtotype{$linkid};
	1324	$newrelation .= ";" . $linktype;
	1325	}
	1326	$newrelation .= "," . $linkid;
	1327	}
	1328	$newrelation .= ";";
	1329
	1330
	1331	# output the new line
	1332	print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
	1333	}
	1334	close TH;
	1335	close TO;
	1336
	1337	undef %thesaurustophindid;
	1338	undef %linkidtotype;
	1339	undef %phindidtofrequency;
	1340
	1341	# 5.
	1342	# Read thesaurus data (in phind-id format) into memory
	1343	my %thesaurusdata;
	1344
	1345	open(TH, "<$newthesaurusfile");
	1346	while(<TH>) {
	1347	chomp;
	1348	($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
	1349	die unless ($phindid && $symbols);
	1350	$thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
	1351	}
	1352	close TH;
	1353
	1354	# 6.
	1355	# Add thesaurus data to phrases file
	1356	my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
	1357	my (@documwents, @newexp, $k, $n);
	1358	my $linenumber = 0;
	1359
	1360	open(IN, "<$infile");
	1361	open(OUT, ">$outfile");
	1362
	1363	# Update existing phrases
	1364	while(<IN>) {
	1365
	1366	chomp;
	1367	@fields = split(/:/, $_);
	1368
	1369	# get data for this line
	1370	$phindid = shift @fields;
	1371
	1372	# output the phrase data, with thesaurus information
	1373	print OUT "$phindid:", join(":", @fields);
	1374
	1375	# add thesaurus data
	1376	if (defined($thesaurusdata{$phindid})) {
	1377	@fields = split(/:/, $thesaurusdata{$phindid});
	1378	shift @fields;
	1379	$linkcounter = shift @fields;
	1380	$relations = shift @fields;
	1381
	1382	print OUT ":$linkcounter:$relations";
	1383	$thesaurusdata{$phindid} = "";
	1384	}
	1385	print OUT "\n";
	1386	}
	1387	close IN;
	1388
	1389	# Add phrases that aren't already in the file
	1390	foreach $phindid (sort numerically keys %thesaurusdata) {
	1391	next unless ($thesaurusdata{$phindid});
	1392
	1393	@fields = split(/:/, $thesaurusdata{$phindid});
	1394	$symbols = shift @fields;
	1395	$linkcounter = shift @fields;
	1396	$relations = shift @fields;
	1397
	1398	print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
	1399	}
	1400	close OUT;
	1401
	1402	}
	1403
	1404	# restore_vocabulary_data
	1405	#
	1406	# Read phrases.3 and restore vocabulary information. Then write
[8362]	1407	# this data to the MGPP input files (pword.txt and pdata.txt) and
[3472]	1408	# (if requested) to the saved phrases file.
	1409
	1410	sub restore_vocabulary_data {
	1411	my ($self) = @_;
	1412
	1413	my $out = $self->{'outhandle'};
	1414	my $verbosity = $self->{'verbosity'};
	1415	print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
	1416
	1417	my $phinddir = $self->{'phinddir'};
	1418	my $infile = &util::filename_cat($phinddir, 'phrases.3');
	1419	my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
	1420	my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
	1421	my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
	1422
	1423	my $savephrases = $self->{'savephrases'};
	1424
	1425	# 1.
	1426	# Read the vocabulary file
	1427	open(V, "<$vocabfile")
	1428	\|\| die "Cannot open $vocabfile: $!";
	1429	my @symbol;
	1430	my $i = 1;
	1431	while(<V>) {
	1432	chomp;
	1433	$symbol[$i++] = $_;
	1434	}
	1435	close V;
	1436
	1437	# 2.
	1438	# Translate phrases.3 to MGPP input files
[10253]	1439	my ($key, $text, $word, $isThesaurus, $line);
[3472]	1440	my @fields;
	1441	my $linenumber = 0;
	1442
	1443	open(IN, "<$infile");
	1444	open(DATA, ">$datafile");
	1445	open(WORD, ">$wordfile");
	1446
	1447	# Save the phrases in a separate text file
	1448	if ($savephrases) {
	1449	print $out "Saving phrases in $savephrases\n" if ($verbosity);
	1450	open(SAVE, ">$savephrases");
	1451	}
	1452
	1453	while(<IN>) {
	1454
	1455	# read the line
	1456	chomp;
	1457	$line = $_;
	1458	@fields = split(/:/, $line);
	1459
	1460	# get a phrase number for this line
	1461	$key = shift @fields;
	1462
	1463	# restore the text of the phrase
	1464	$text = shift @fields;
	1465	$text =~ s/s(\d+)/$symbol[$1]/g;
	1466	if ($text =~ / /) {
	1467	$word = "";
	1468	} elsif ($text ne 'untranslated') {
	1469	$word = $text;
	1470	}
	1471
	1472	# output the phrase data
	1473	print DATA "<Document>";
	1474	print DATA "$key:$text:", join(":", @fields), ":\n";
	1475
	1476	# output the word index search data
	1477	print WORD "<Document>$word\n";
	1478
	1479	# output the phrases to a text file
	1480	if ($savephrases) {
	1481	if ((scalar @fields) == 7) {
	1482	$isThesaurus = 1;
	1483	} else {
	1484	$isThesaurus = 0;
	1485	}
	1486	print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
	1487	}
	1488	}
	1489	close IN;
	1490	close WORD;
	1491	close DATA;
	1492	close SAVE if ($savephrases);
	1493
	1494	}
	1495
	1496
	1497
	1498	# sort routines used to renumber phrases
	1499
	1500	sub numerically { $a <=> $b }
	1501
	1502	sub by_doc_frequency {
	1503	my $fa = 1;
	1504	if ($a =~ /,/) {
	1505	$fa = $a;
	1506	$fa =~ s/\d+,//;
	1507	}
	1508	my $fb = 1;
	1509	if ($b =~ /,/) {
	1510	$fb = $b;
	1511	$fb =~ s/\d+,//;
	1512	}
	1513
	1514	return ($fb <=> $fa);
	1515	}
	1516
	1517	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: