Context Navigation

source: gsdl/trunk/perllib/classify/Phind.pm@ 20454

Last change on this file since 20454 was 20454, checked in by kjdon, 15 years ago
strip ex from metadata type args
Property svn:keywords set to `Author Date Id Revision`
File size: 40.5 KB

Rev	Line
[3472]	1	###########################################################################
	2	#
	3	# Phind.pm -- the Phind classifier
	4	#
	5	# Copyright (C) 2000 Gordon W. Paynter
	6	# Copyright (C) 2000 New Zealand Digital Library Project
	7	#
	8	#
	9	# A component of the Greenstone digital library software
	10	# from the New Zealand Digital Library Project at the
	11	# University of Waikato, New Zealand.
	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# The Phind classifier plugin.
	30	# Type "classinfo.pl Phind" at the command line for a summary.
	31
[3540]	32	# 12/05/02 Added usage datastructure - John Thompson
	33
[3472]	34	package Phind;
	35
[17209]	36	use BaseClassifier;
[3472]	37	use util;
	38	use ghtml;
	39	use unicode;
	40
[10253]	41	use strict;
	42	no strict 'refs'; # allow filehandles to be variables and viceversa
	43
# Phind directories queued for tidying by the END block.
my @removedirs = ();

# File suffixes that the END block must leave in place; any file whose
# suffix is not listed here is deleted from a queued directory.
my %wanted_index_files =
    map { $_ => 1 } qw(td t ti tl tsd idb ib1 ib2 ib3 i il w wa);
	59
	60	sub BEGIN {
[17209]	61	@Phind::ISA = ('BaseClassifier');
[3472]	62	}
	63
	64	sub END {
	65
	66	# Tidy up stray files - we do this here as there's some weird problem
	67	# preventing us from doing it in the get_classify_info() function (on
	68	# windows at least) where the close() appears to fail on txthandle and
	69	# dochandle, thus preventing us from deleting those files
	70
	71	foreach my $dir (@removedirs) {
	72	if (-d $dir && opendir (DIR, $dir)) {
	73	my @files = readdir DIR;
	74	closedir DIR;
	75
[10253]	76	foreach my $file (@files) {
[3472]	77	next if $file =~ /^\.\.?$/;
	78	my ($suffix) = $file =~ /\.([^\.]+)$/;
	79	if (!defined $suffix \|\| !defined $wanted_index_files{$suffix}) {
	80	# delete it!
	81	&util::rm (&util::filename_cat ($dir, $file));
	82	}
	83	}
	84	}
	85	}
	86	}
	87
# Argument descriptions for the Phind classifier. new() appends these to
# the "ArgList" it hands to BaseClassifier.
my $arguments = [
    {   'name' => 'text',
        'desc' => '{Phind.text}',
        'type' => 'string',
        'deft' => 'section:Title,section:text',
        'reqd' => 'no',
    },
    {   'name' => 'title',
        'desc' => '{Phind.title}',
        'type' => 'metadata',
        'deft' => 'Title',
        'reqd' => 'no',
    },
    {   'name' => 'buttonname',
        'desc' => '{BasClas.buttonname}',
        'type' => 'string',
        'deft' => 'Phrase',
        'reqd' => 'no',
    },
    {   'name' => 'language',
        'desc' => '{Phind.language}',
        'type' => 'string',
        'deft' => 'en',
        'reqd' => 'no',
    },
    {   'name' => 'savephrases',
        'desc' => '{Phind.savephrases}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'no',
    },
    {   'name'  => 'suffixmode',
        'desc'  => '{Phind.suffixmode}',
        'type'  => 'int',
        'deft'  => '1',
        'range' => '0,1',
        'reqd'  => 'no',
    },
    {   'name'  => 'min_occurs',
        'desc'  => '{Phind.min_occurs}',
        'type'  => 'int',
        'deft'  => '2',
        'range' => '1,',
        'reqd'  => 'no',
    },
    {   'name' => 'thesaurus',
        'desc' => '{Phind.thesaurus}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'no',
    },
    {   'name' => 'untidy',
        'desc' => '{Phind.untidy}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Classifier-level options. new() appends this to the "OptList" it hands
# to BaseClassifier.
my $options = {
    'name'     => 'Phind',
    'desc'     => '{Phind.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
[3540]	141
[3472]	142
	143	# Phrase delimiter symbols - these should be abstracted out someplace
	144
	145	my $colstart = "COLLECTIONSTART";
	146	my $colend = "COLLECTIONEND";
	147	my $doclimit = "DOCUMENTLIMIT";
	148	my $senlimit = "SENTENCELIMIT";
	149	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
	150
	151
	152	# Create a new Phind browser based on collect.cfg
	153
	154	sub new {
[10218]	155	my ($class) = shift (@_);
	156	my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
	157	push(@$classifierslist, $class);
[3472]	158
[17209]	159	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	160	push(@{$hashArgOptLists->{"OptList"}},$options);
[3540]	161
[17209]	162	my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
[6968]	163
[10253]	164	if ($self->{'info_only'}) {
	165	# don't worry about any options etc
	166	return bless $self, $class;
	167	}
	168
[3472]	169	# Ensure the Phind generate scripts are in place
	170	my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
	171	$file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
	172	my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
	173	if (!(-e $file1)) {
	174	print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
	175	exit(1);
	176	}
	177
[20454]	178	# things that may have ex. in them that need to be stripped off
	179	$self->{'text'} = $self->strip_ex_from_metadata($self->{'text'});
	180	$self->{'title'} = $self->strip_ex_from_metadata($self->{'title'});
	181
[10218]	182	# Transfer value from Auto Parsing to the variable name that used in previous GreenStone.
[20454]	183
[10218]	184	$self->{"indexes"} = $self->{"text"};
[3472]	185
[10218]	186	# Further setup
	187	$self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
	188	$self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
[6982]	189	if (! defined $self->{'builddir'}) {
	190	$self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
	191	}
[10218]	192	$self->{'total'} = 0;
	193
	194	# Clean out the unused keys
	195	delete $self->{"text"};
[6982]	196
[3472]	197	return bless $self, $class;
	198	}
	199
	200
	201	# Initialise the Phind classifier
	202
	203	sub init {
	204	my $self = shift (@_);
	205
	206	# ensure we have a build directory
	207	my $builddir = $self->{'builddir'};
	208	die unless (-e "$builddir");
	209
	210	# create Phind directory
	211	my $phnumber = 1;
	212	my $phinddir = &util::filename_cat($builddir, "phind1");
	213	while (-e "$phinddir") {
	214	$phnumber++;
	215	$phinddir = &util::filename_cat($builddir, "phind$phnumber");
	216	}
	217	&util::mk_dir("$phinddir");
	218	$self->{'phinddir'} = $phinddir;
	219	$self->{'phindnumber'} = $phnumber;
	220
	221	push(@removedirs, $phinddir) unless $self->{'untidy'};
	222
	223	# open filehandles for documents and text
	224	my $clausefile = &util::filename_cat("$phinddir", "clauses");
	225	&util::rm($clausefile) if (-e $clausefile);
	226
	227	my $txthandle = 'TEXT' . $phnumber;
	228	open($txthandle, ">$clausefile") \|\| die "Cannot open $clausefile: $!";
	229	$self->{'txthandle'} = $txthandle;
	230
	231	my $docfile = &util::filename_cat("$phinddir", "docs.txt");
	232	&util::rm($docfile) if (-e $docfile);
	233
	234	my $dochandle = 'DOC' . $phnumber;
	235	open($dochandle, ">$docfile") \|\| die "Cannot open $docfile: $!";
	236	$self->{'dochandle'} = $dochandle;
	237
	238	}
	239
	240
	241	# Classify each document.
	242	#
	243	# Each document is passed here in turn. The classifier extracts the
	244	# text of each and stores it in the clauses file. Document details are
	245	# stored in the docs.txt file.
	246
	247	sub classify {
	248	my $self = shift (@_);
[18455]	249	my ($doc_obj,$edit_mode) = @_;
[3472]	250
	251	my $verbosity = $self->{'verbosity'};
	252	my $top_section = $doc_obj->get_top_section();
	253
[10218]	254	my $titlefield = $self->{'title'};
[3472]	255
	256	my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
[3536]	257	if (!defined($title)) {
	258	$title = "";
	259	print STDERR "Phind: document has no title\n";
	260	}
[3472]	261	print "process: $title\n" if ($verbosity > 2);
	262
	263	# Only consider the file if it is in the correct language
	264	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
[10218]	265	my $phrlanguage = $self->{'language'};
[3472]	266	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
[18455]	267
	268	if ($edit_mode eq "delete") {
	269	# This classifier works quite differently to the others
	270	# Probably doesn't support incremental building anyway
	271	return;
	272	}
	273
[3472]	274	# record this file
	275	$self->{'total'} ++;
[10253]	276	# what is $file ???
	277	# print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
[3472]	278
	279	# Store document details
	280	my $OID = $doc_obj->get_OID();
	281	$OID = "NULL" unless defined $OID;
	282	my $dochandle = $self->{'dochandle'};
	283	print $dochandle "<Document>\t$OID\t$title\n";
	284
	285	# Store the text occuring in this object
	286
	287	# output the document delimiter
	288	my $txthandle = $self->{'txthandle'};
	289	print $txthandle "$doclimit\n";
	290
[8362]	291	# iterate over the required indexes and store their text
[3472]	292	my $indexes = $self->{'indexes'};
	293	my $text = "";
	294	my ($part, $level, $field, $section, $data, $dataref);
	295
	296	foreach $part (split(/,/, $indexes)) {
	297
	298	# Each field has a level and a data element ((e.g. document:Title)
	299	($level, $field) = split(/:/, $part);
	300	die unless ($level && $field);
	301
	302	# Extract the text from every section
[3540]	303	# (In phind, document:text and section:text are equivalent)
[3472]	304	if ($field eq "text") {
	305	$data = "";
	306	$section = $doc_obj->get_top_section();
	307	while (defined($section)) {
	308	$data .= $doc_obj->get_text($section) . "\n";
	309	$section = $doc_obj->get_next_section($section);
	310	}
	311	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	312	}
	313
	314	# Extract a metadata field from a document
	315	# (If there is more than one element of the given type, get them all.)
	316	elsif ($level eq "document") {
	317	$dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
	318	foreach $data (@$dataref) {
	319	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	320	}
	321	}
	322
	323	# Extract metadata from every section in a document
	324	elsif ($level eq "section") {
	325	$data = "";
	326	$section = $doc_obj->get_top_section();
	327	while (defined($section)) {
	328	$dataref = $doc_obj->get_metadata($section, $field);
	329	$data .= join("\n", @$dataref) . "\n";
	330	$section = $doc_obj->get_next_section($section);
	331	}
	332	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	333	}
	334
	335	# Some sort of specification which I don't understand
	336	else {
	337	die "Unknown level ($level) in Phind index ($part)\n";
	338	}
	339
	340	}
	341
	342	# output the text
	343	$text =~ tr/\n//s;
	344	print $txthandle "$text";
	345	}
	346
	347
	348	# Construct the classifier from the information already gathered
	349	#
	350	# When get_classify_info is called, the clauses and docs.txt files have
[3540]	351	# already been constructed in the Phind directory. This function will
[3472]	352	# translate them into compressed, indexed MGPP files that can be read by
	353	# the phindcgi script. It will also register our classifier so that it
	354	# shows up in the navigation bar.
	355
	356	sub get_classify_info {
	357	my $self = shift (@_);
[6408]	358	my ($gli) = @_;
[3472]	359
	360	close $self->{'dochandle'};
	361	close $self->{'txthandle'};
	362	my $verbosity = $self->{'verbosity'};
	363	my $out = $self->{'outhandle'};
	364	my $phinddir = $self->{'phinddir'};
	365
	366	my $osextra = "";
	367	if ($ENV{'GSDLOS'} !~ /^windows$/i) {
	368	$osextra = " -d /";
	369	}
	370
[6408]	371	print STDERR "</Stage>\n" if $gli;
	372
[3472]	373	if ($verbosity) {
	374	print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
	375	print $out "*** in ", $self->{'phinddir'}, "\n";
	376	}
	377
[6408]	378	print STDERR "<Stage name='Phind'>\n" if $gli;
	379
[3540]	380	# Construct phind indexes
[3472]	381	my $suffixmode = $self->{'suffixmode'};
[8362]	382	my $min_occurs = $self->{'min_occurs'};
[3472]	383	my ($command, $status);
	384
	385	# Generate the vocabulary, symbol statistics, and numbers file
	386	# from the clauses file
	387	print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
[6408]	388	print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
[3472]	389	&extract_vocabulary($self);
[3540]	390
[3472]	391	# Use the suffix program to generate the phind/phrases file
	392	print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
[6408]	393	print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
[8362]	394	&execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
[3472]	395
	396	# check that we generated some files. It's not necessarily an error if
	397	# we didn't (execute() would have quit on error), but we can't go on.
	398	my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
	399	if (! -r $phrasesfile) {
[6408]	400	print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
[3472]	401	print $out "\nNo phrases found for Phind classifier!\n";
	402	return;
[3540]	403	}
[3472]	404
	405	# Create the phrase file and put phrase numbers in phind/phrases
	406	print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
[6408]	407	print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
[3472]	408	&renumber_phrases($self);
	409
	410	print $out "\nCreating phrase databases\n";
[6408]	411	print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
[3472]	412	my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
	413	my $mg_stem = &util::filename_cat($phinddir, "pdata");
	414
	415	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
	416	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
	417	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
	418
	419	# create the mg index of words
	420	print $out "\nCreating word-level search indexes\n";
[6408]	421	print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
[3472]	422	$mg_input = &util::filename_cat($phinddir, "pword.txt");
	423	$mg_stem = &util::filename_cat($phinddir, "pword");
	424
	425	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
	426	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
	427	&execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
	428	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
	429	&execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
	430	&execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
	431
	432	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
	433	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
	434	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
	435
	436	# create the mg document information database
	437	print $out "\nCreating document information databases\n";
[6408]	438	print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
[3472]	439	$mg_input = &util::filename_cat($phinddir, "docs.txt");
	440	$mg_stem = &util::filename_cat($phinddir, "docs");
	441
	442	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
	443	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
	444	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
	445
	446	# Return the information about the classifier that we'll later want to
	447	# use to create macros when the Phind classifier document is displayed.
	448	my %classifyinfo = ('thistype'=>'Invisible',
	449	'childtype'=>'Phind',
	450	'Title'=>$self->{'buttonname'},
	451	'parameters'=>"phindnumber=$self->{'phindnumber'}",
	452	'contains'=>[]);
	453
	454	my $collection = $self->{'collection'};
	455	my $url = "library?a=p&p=phind&c=$collection";
	456	push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
	457
	458	return \%classifyinfo;
	459	}
	460
	461
	462
	463	sub convert_gml_to_tokens {
	464
	465	my ($language_exp, $text) = @_;
	466
	467	# escape any magic words... - jrm21
	468	foreach my $delim (@delimiters) {
	469	my $replacement=lc($delim);
	470	my $num= $text=~ s/$delim/$replacement/g;
	471	if (!$num) {$num=0;}
	472	}
	473
	474	if ($language_exp =~ /en/) {
	475	return &convert_gml_to_tokens_EN($text);
	476	}
[3540]	477
[3472]	478	if ($language_exp =~ /zh/) {
	479	return &convert_gml_to_tokens_ZH($text);
[3540]	480	}
[3472]	481
	482	$_ = $text;
	483
	484	# 1. remove GML tags
	485
	486	# Remove everything that is in a tag
	487	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
	488	s/\s<br>\s/ LINEBREAK /isgo;
	489	s/<[^>]*>/ /sgo;
	490
	491	# Now we have the text, but it may contain HTML
	492	# elements coded as > etc. Remove these tags.
	493	s/&/&/sgo;
	494	s/</</sgo;
	495	s/>/>/sgo;
	496	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
	497	s/\s<br>\s/ LINEBREAK /isgo;
	498	s/<[^>]*>/ /sgo;
	499
	500	# replace<p> and <br> placeholders with clause break symbol (\n)
	501	s/\s+/ /gso;
	502	s/PARAGRAPHBREAK/\n/sgo;
	503	s/LINEBREAK/\n/sgo;
	504
	505
	506
	507
	508	# 2. Split the remaining text into space-delimited tokens
	509
[3540]	510	# Convert any HTML special characters (like ") to their UTF8 equivalent
	511	s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
[3472]	512
	513	# Split text at word boundaries
	514	s/\b/ /go;
	515
	516	# 3. Convert the remaining text to "clause format"
	517
	518	# Insert newline if the end of a sentence is detected
	519	# (delimter is: "[\.\?\!]\s")
	520	# s/\s*[\.\?\!]\s+/\n/go;
	521
	522	# remove unnecessary punctuation and replace with clause break symbol (\n)
	523	s/[^\w ]/\n/go;
	524
	525	# remove extraneous whitespace
	526	s/ +/ /sgo;
	527	s/^\s+//mgo;
	528	s/\s*$/\n/mgo;
	529
	530	# remove lines that contain one word or less
	531	s/^\S*$//mgo;
	532	s/^\s*$//mgo;
	533	tr/\n//s;
	534
	535	return $_;
	536	}
	537
	538	# a chinese version
	539	sub convert_gml_to_tokens_ZH {
	540
	541	$_ = shift @_;
	542
	543	# Replace all whitespace with a simple space
	544	s/\s+/ /gs;
	545	# Remove everything that is in a tag
	546	s/\s<p>\s/ PARAGRAPHBREAK /isg;
	547	s/\s<br>\s/ LINEBREAK /isg;
	548	s/<[^>]*>/ /sg;
	549
	550	# Now we have the text, but it may contain HTML
	551	# elements coded as > etc. Remove these tags.
	552	s/</</sg;
	553	s/>/>/sg;
	554
	555	s/\s+/ /sg;
	556	s/\s<p>\s/ PARAGRAPHBREAK /isg;
	557	s/\s<br>\s/ LINEBREAK /isg;
	558	s/<[^>]*>/ /sg;
	559
	560	# remove & and other miscellaneous markup tags
	561	s/&/&/sg;
	562	s/</</sg;
	563	s/>/>/sg;
	564	s/&/&/sg;
	565
	566	# replace<p> and <br> placeholders with carriage returns
	567	s/PARAGRAPHBREAK/\n/sg;
	568	s/LINEBREAK/\n/sg;
	569
	570
	571	# print STDERR "text:$_\n";
	572	return $_;
	573	}
[3540]	574
# A version of convert_gml_to_tokens that is fine-tuned to the English language.
#
# Takes the text as its only argument and returns it in "clause format":
# markup removed, punctuation turned into clause breaks, one clause per
# line, and lines of one word or less dropped. The caller's $_ is left
# untouched.

sub convert_gml_to_tokens_EN {
    local $_ = shift @_;

    # FIRST, remove GML tags

    # Replace all whitespace with a simple space
    s/\s+/ /gs;

    # Remove everything that is in a tag
    s/\s<p>\s/ PARAGRAPHBREAK /isg;
    s/\s<br>\s/ LINEBREAK /isg;
    s/<[^>]*>/ /sg;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc. Decode them and remove these tags too.
    s/&lt;/</sg;
    s/&gt;/>/sg;

    s/\s+/ /sg;
    s/\s<p>\s/ PARAGRAPHBREAK /isg;
    s/\s<br>\s/ LINEBREAK /isg;
    s/<[^>]*>/ /sg;

    # decode &amp; and any angle-bracket entities it was hiding
    s/&amp;/&/sg;
    s/&lt;/</sg;
    s/&gt;/>/sg;
    s/&amp;/&/sg;

    # replace <p> and <br> placeholders with carriage returns
    s/PARAGRAPHBREAK/\n/sg;
    s/LINEBREAK/\n/sg;


    # Exceptional punctuation
    #
    # We make special cases of some punctuation

    # remove any apostrophe that indicates omitted letters
    s/(\w+)\'(\w*\s)/ $1$2 /g;

    # remove period that appears in a person's initials
    s/\s([A-Z])\./ $1 /g;

    # replace hyphens in hyphenated words and names with a space
    s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;

    # Convert the remaining text to "clause format",
    # This means removing all excess punctuation and garbage text,
    # normalising valid punctuation to fullstops and commas,
    # then putting one clause on each line.

    # Insert newline when the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    s/\s*[\.\?\!]\s+/\n/g;

    # split numbers after four digits
    s/(\d\d\d\d)/$1 /g;

    # split words after 32 characters

    # squash repeated punctuation
    tr/A-Za-z0-9 //cs;

    # save email addresses
    # s/\w+@\w+\.[\w\.]+/EMAIL/g;

    # normalise clause breaks (mostly punctuation symbols) to commas
    s/[^A-Za-z0-9 \n]+/ , /g;

    # Remove repeated commas, and replace with newline
    s/\s*,[, ]+/\n/g;

    # remove extra whitespace
    s/ +/ /sg;
    s/^\s+//mg;
    s/\s*$/\n/mg;

    # remove lines that contain one word or less
    s/^\w*$//mg;
    s/^\s*$//mg;
    tr/\n//s;

    return $_;

}
	663
	664
	665
	666	# Execute a system command
	667
	668	sub execute {
	669	my ($command, $verbosity, $outhandle) = @_;
	670	print $outhandle "Executing: $command\n" if ($verbosity > 2);
	671	$! = 0;
	672	my $status = system($command);
	673	if ($status != 0) {
	674	print STDERR "Phind - Error executing '$command': $!\n";
[3540]	675	exit($status); # this causes the build to fail...
[3472]	676	}
	677	}
	678
	679
	680	# Generate the vocabulary, symbol statistics, and numbers file from the
	681	# clauses file. This is legacy code, so is a bit messy and probably wont
	682	# run under windows.
	683
	684	sub extract_vocabulary {
	685	my ($self) = @_;
	686
	687	my $verbosity = $self->{'verbosity'};
	688	my $out = $self->{'outhandle'};
	689
	690	my $collectiondir = $self->{'collectiondir'};
	691	my $phinddir = $self->{'phinddir'};
	692
[10218]	693	my $language_exp = $self->{'language'};
[3472]	694
	695	my ($w, $l, $line, $word);
	696
	697	my ($first_delimiter, $last_delimiter,
	698	$first_stopword, $last_stopword,
	699	$first_extractword, $last_extractword,
	700	$first_contentword, $last_contentword,
	701	$phrasedelimiter);
	702
	703	my $thesaurus = $self->{'thesaurus'};
	704	my ($thesaurus_links, $thesaurus_terms,
	705	%thesaurus, $first_thesaurusword, $last_thesaurusword);
	706
	707	my %symbol;
	708	my (%freq);
	709
	710	print $out "Calculating vocabulary\n" if ($verbosity > 1);
	711
	712	# Read and store the stopwords
	713	my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
	714	my $stopword_files = ();
	715	my ($language, $language_dir, $file, $file_name);
	716	my %stopwords;
	717
	718	# Examine each directory in the stopword directory
	719	opendir(STOPDIR, $stopdir);
	720	foreach $language (readdir STOPDIR) {
	721
	722	# Ignore entries that do not match the classifier's language
	723	next unless ($language =~ /$language_exp/);
	724	$language_dir = &util::filename_cat($stopdir, $language);
	725	next unless (-d "$language_dir");
	726
	727	opendir(LANGDIR, $language_dir);
	728	foreach $file (readdir LANGDIR) {
	729
	730	# Ignore entries that are not stopword files
	731	next unless ($file =~ /sw$/);
	732	$file_name = &util::filename_cat($language_dir, $file);
	733	next unless (-f "$file_name");
	734
	735	# Read the stopwords
	736	open(STOPFILE, "<$file_name");
	737	while (<STOPFILE>) {
	738	s/^\s+//;
	739	s/\s.*//;
	740	$word = $_;
	741	$l = lc($word);
	742	$stopwords{$l} = $word;
	743	}
	744	close STOPFILE;
	745
	746	}
	747	closedir LANGDIR;
	748	}
	749	closedir STOPDIR;
	750
	751	# Read thesaurus information
	752	if ($thesaurus) {
	753
	754	# link file exists
	755	$thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
	756	die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
	757
	758	# ensure term file exists in the correct language
	759	if ($language_exp =~ /^([a-z][a-z])/) {
	760	$language = $1;
	761	} else {
	762	$language = 'en';
	763	}
	764	$thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
	765	die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
	766
	767
	768	# Read the thesaurus terms
	769	open(TH, "<$thesaurus_terms");
	770	while(<TH>) {
	771	s/^\d+ //;
	772	s/\(.*\)//;
	773	foreach $w (split(/\s+/, $_)) {
	774	$thesaurus{lc($w)} = $w;
	775	}
	776	}
	777	close TH;
	778	}
	779
	780	# Read words in the text and count occurences
	781	open(TXT, "<$phinddir/clauses");
	782
	783	my @words;
	784	while(<TXT>) {
	785	$line = $_;
	786	next unless ($line =~ /./);
	787
	788	@words = split(/\s+/, $line);
	789	foreach $w (@words) {
	790	$l = lc($w);
	791	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
	792	$freq{$w}++;
	793	}
	794	$freq{$senlimit}++;
	795	}
	796
	797	close TXT;
	798
	799	# Calculate the "best" form of each word
	800	my (%bestform, %totalfreq, %bestfreq);
	801
	802	foreach $w (sort (keys %freq)) {
	803	$l = lc($w);
	804
	805	# totalfreq is the number of times a term appears in any form
	806	$totalfreq{$l} += $freq{$w};
	807
	808	if (defined $stopwords{$l}) {
	809	$bestform{$l} = $stopwords{$l};
	810
	811	} elsif (defined $thesaurus{$l}) {
	812	$bestform{$l} = $thesaurus{$l};
	813
	814	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
	815	$bestfreq{$l} = $freq{$w};
	816	$bestform{$l} = $w;
	817	}
	818	}
	819	undef %freq;
	820	undef %bestfreq;
	821
	822
	823	# Assign symbol numbers to tokens
	824	my $nextsymbol = 1;
	825	my (@vocab);
	826
	827	# Delimiters
	828	$first_delimiter = 1;
	829
	830	foreach $word (@delimiters) {
	831
	832	# $word = lc($word); # jrm21
	833	$word = uc($word);
	834	$bestform{$word} = $word;
	835	$vocab[$nextsymbol] = $word;
	836	$symbol{$word} = $nextsymbol;
	837	$nextsymbol++;
	838	}
	839	$last_delimiter = $nextsymbol - 1;
	840	# Stopwords
	841	$first_stopword = $nextsymbol;
	842
	843	foreach my $word (sort keys %stopwords) {
	844	# don't include stopword unless it occurs in the text
	845	$word = lc($word);
	846	next unless ($totalfreq{$word});
	847	next if ($symbol{$word});
	848
	849	$vocab[$nextsymbol] = $word;
	850	$symbol{$word} = $nextsymbol;
	851	$nextsymbol++;
	852	}
	853	$last_stopword = $nextsymbol - 1;
	854	$first_contentword = $nextsymbol;
	855
	856	# Thesaurus terms
	857	if ($thesaurus) {
	858	$first_thesaurusword = $nextsymbol;
	859
	860	foreach my $word (sort keys %thesaurus) {
	861
	862	$word = lc($word);
	863	next if ($symbol{$word});
	864	$bestform{$word} = $thesaurus{$word};
	865
	866	$vocab[$nextsymbol] = $word;
	867	$symbol{$word} = $nextsymbol;
	868	$nextsymbol++;
	869
	870	}
	871	$last_thesaurusword = $nextsymbol - 1;
	872	}
	873
	874	# Other content words
	875	$first_extractword = $nextsymbol;
	876
	877	foreach my $word (sort (keys %bestform)) {
	878
	879	next if ($symbol{$word});
	880
	881	$vocab[$nextsymbol] = $word;
	882	$symbol{$word} = $nextsymbol;
	883	$nextsymbol++;
	884	}
	885	$last_extractword = $nextsymbol - 1;
	886	$last_contentword = $nextsymbol - 1;
	887
	888	# Outut the words
	889	print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
	890	open(VOC, ">$phinddir/clauses.vocab");
	891
	892	for (my $i = 1; $i < $nextsymbol; $i++) {
	893	$w = $vocab[$i];
	894
	895	print VOC "$bestform{$w}\n";
	896	$totalfreq{$w} = 0 unless ($totalfreq{$w});
	897	}
	898	close VOC;
	899
	900
	901	# Create statistics file
	902	# Output statistics about the vocablary
	903	print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
	904	&util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
	905
	906	open(STAT, ">$phinddir/clauses.stats")
	907	\|\| die "Cannot open $phinddir/clauses.stats: $!";
	908
	909	print STAT "first_delimiter $first_delimiter\n";
	910	print STAT "last_delimiter $last_delimiter\n";
	911	print STAT "first_stopword $first_stopword\n";
	912	print STAT "last_stopword $last_stopword\n";
	913	if ($thesaurus) {
	914	print STAT "first_thesaurusword $first_thesaurusword\n";
	915	print STAT "last_thesaurusword $last_thesaurusword\n";
	916	}
	917	print STAT "first_extractword $first_extractword\n";
	918	print STAT "last_extractword $last_extractword\n";
	919	print STAT "first_contentword $first_contentword\n";
	920	print STAT "last_contentword $last_contentword\n";
	921	print STAT "first_symbol $first_delimiter\n";
	922	print STAT "last_symbol $last_contentword\n";
	923	print STAT "first_word $first_stopword\n";
	924	print STAT "last_word $last_contentword\n";
	925	close STAT;
	926
	927	undef @vocab;
	928
	929
	930	# Create numbers file
	931	# Save text as symbol numbers
	932	print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
	933
	934	open(TXT, "<$phinddir/clauses");
	935	open(NUM, ">$phinddir/clauses.numbers");
	936
	937	## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
	938	## print NUM "$symbol{lc($colstart)}\n"; # jrm21
	939	$phrasedelimiter = $symbol{$senlimit};
	940	print NUM "$symbol{$colstart}\n";
	941
	942	# set up the special symbols that delimit documents and sentences
	943	while(<TXT>) {
	944
	945	# split sentence into a list of tokens
	946	$line = $_;
	947	next unless ($line =~ /./);
	948	@words = split(/\s+/, $line);
	949
	950	# output one token at a time
	951	foreach $word (@words) {
	952	# don't lower-case special delimiters - jrm21
	953	if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
	954	$word = lc($word);
	955	}
	956	print NUM "$symbol{$word}\n";
	957	}
	958
	959	# output phrase delimiter
	960	print NUM "$phrasedelimiter\n";
	961	}
	962
	963	close TXT;
	964	# print NUM "$symbol{lc($colend)}\n";# jrm21
	965	print NUM "$symbol{$colend}\n";
	966	close NUM;
	967
	968	# Save thesaurus data in one convienient file
	969	if ($thesaurus) {
	970
	971	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
	972
	973
	974	print $out "Saving thesaurus as numbers in $thesaurusfile\n"
	975	if ($verbosity > 1);
	976
	977	# Read the thesaurus terms
	978	my ($num, $text, %thes_symbols);
	979
	980	open(TH, "<$thesaurus_terms");
	981	while(<TH>) {
	982	chomp;
	983	@words = split(/\s+/, $_);
	984	$num = shift @words;
	985	$text = "";
	986
	987	# translate words into symbol numbers
	988	foreach $word (@words) {
	989	$word = lc($word);
	990	if ($symbol{$word}) {
	991	$text .= "s$symbol{$word} ";
	992	} elsif ($verbosity) {
	993	print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
	994	}
	995	}
	996	$text =~ s/ $//;
	997	$thes_symbols{$num} = $text;
	998	}
	999	close TH;
	1000
	1001	# Read the thesaurus links and write the corresponding data
	1002	open(TH, "<$thesaurus_links");
	1003	open(THOUT, ">$thesaurusfile");
	1004
	1005	while(<TH>) {
	1006	chomp;
	1007	($num, $text) = split(/:/, $_);
	1008
	1009	if (defined($thes_symbols{$num})) {
	1010	print THOUT "$num:$thes_symbols{$num}:$text\n";
	1011	} else {
	1012	print THOUT "$num:untranslated:$text\n";
	1013	}
	1014	}
	1015	close TH;
	1016	close THOUT;
	1017	}
	1018
	1019
	1020
	1021
	1022	}
	1023
	1024
	1025	# renumber_phrases
	1026	#
	1027	# Prepare the phrases file to be input to mgpp. The biggest problem is
	1028	# reconciling the phrase identifiers used by the suffix program (which
	1029	# we'll call suffix-id numbers) with the numbers used in the thesaurus
	1030	# (thesaurus-id) to create a common set of phind id numbers (phind-id).
	1031	# Phind-id numbers must be sorted by frequency of occurrence.
	1032	#
	1033	# Start creating a set of phind-id numbers from the sorted suffix-id
	1034	# numbers and (if required) the thesaurus-id numbers. Then add any other
	1035	# phrases occurring in the thesaurus.
	1036	#
	1037	# The last thing we have to do is restore the vocabulary information to the
	1038	# phrase file so that the phrases are stored as words, not as symbol
	1039	# numbers.
	1040
	1041	# The original phrases file looks something like this:
	1042	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
	1043	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
	1044	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
	1045	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
	1046
	1047
	sub renumber_phrases {
	    # Driver for the three renumbering passes.  They are run strictly in
	    # this order and each is handed the same classifier object.
	    my $self = shift;

	    renumber_suffix_data($self);       # phrases   -> phrases.2
	    renumber_thesaurus_data($self);    # phrases.2 -> phrases.3

	    # phrases.3 -> pdata.txt and pword.txt
	    return restore_vocabulary_data($self);
	}
	1056
	1057
	1058
	# renumber_suffix_data
	#
	# Translate the phrases file to phrases.2, using phind keys instead
	# of suffix keys and sorting the expansion and document data.
	#
	# Each input line is split on ':' into
	#   suffixid:symbols:frequency:countexp:expansions:countdocs:documents
	# and each output line is written as
	#   phindid:symbols:frequency:countexp:countdocs:exp1,exp2,:doc1;doc2;
	#
	# Phind-ids are handed out sequentially, starting at 1, in the order
	# the phrases appear in the input file.  The 'd' prefix is stripped
	# from every document entry.
	#
	# Dies if either file cannot be opened, if the output cannot be
	# flushed, or if a line refers to a suffix-id that was never defined.

	sub renumber_suffix_data {
	    my ($self) = @_;

	    my $verbosity = $self->{'verbosity'};
	    my $out = $self->{'outhandle'};
	    print $out "Translate phrases: suffix-ids become phind-id's\n"
	        if ($verbosity);

	    my $phinddir = $self->{'phinddir'};
	    my $infile = &util::filename_cat($phinddir, 'phrases');
	    my $outfile = &util::filename_cat($phinddir, 'phrases.2');

	    # Pass 1: read the phrase file.  Calculate the initial set of
	    # phind-id numbers and store the (phindid -> frequency) relation.
	    my %suffixtophind;
	    my @totalfrequency;
	    my $nextphind = 1;

	    open(my $scan, '<', $infile)
	        or die "Phind: cannot read $infile: $!";
	    while (my $line = <$scan>) {
	        chomp $line;

	        # the second field (the phrase symbols) is not needed yet
	        my ($suffixid, undef, $frequency) = split(/:/, $line);

	        $suffixtophind{$suffixid} = $nextphind;
	        $totalfrequency[$nextphind] = $frequency;
	        $nextphind++;
	    }
	    close($scan);

	    # Pass 2: translate phrases to phrases.2.  Use phind keys (not
	    # suffix keys) and sort expansion and document occurrence data in
	    # order of descending frequency.
	    open(my $in, '<', $infile)
	        or die "Phind: cannot read $infile: $!";
	    open(my $phrases_out, '>', $outfile)
	        or die "Phind: cannot write $outfile: $!";

	    my $linenumber = 0;

	    while (my $line = <$in>) {
	        chomp $line;
	        my ($suffixid, $text, $tf, $countexp, $expansions, $countdocs, $documents)
	            = split(/:/, $line);

	        # get a phrase number for this line
	        die "Phind: no phind-id for suffix-id \"$suffixid\" in $infile"
	            unless (defined($suffixtophind{$suffixid}));
	        my $phindid = $suffixtophind{$suffixid};

	        # output status information
	        $linenumber++;
	        if ($verbosity > 2 && $linenumber % 1000 == 0) {
	            print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
	        }

	        # convert the expansions into phind-id numbers and sort them by
	        # the total frequency of the phrase each one refers to
	        $expansions = '' unless (defined($expansions));
	        my @newexp;
	        foreach my $k (split(/,/, $expansions)) {
	            die "ERROR - no phindid for: $k"
	                unless (defined($suffixtophind{$k}));
	            push @newexp, $suffixtophind{$k};
	        }
	        @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;

	        # strip the 'd' prefixes from the documents and sort them
	        $documents = '' unless (defined($documents));
	        $documents =~ s/d//g;
	        my @documents = split(/;/, $documents);
	        @documents = sort by_doc_frequency @documents;

	        # output the phrase data; both lists keep a trailing separator
	        print {$phrases_out} "$phindid:$text:$tf:$countexp:$countdocs:",
	            join(",", @newexp), ",:", join(";", @documents), ";\n";
	    }

	    close($in);
	    # buffered write errors only surface when the handle is closed
	    close($phrases_out)
	        or die "Phind: error writing $outfile: $!";
	}
	1171
	1172
	# renumber_thesaurus_data
	#
	# Translate phrases.2 to phrases.3, adding thesaurus data if available.
	#
	# When no thesaurus is configured, phrases.2 is simply moved to
	# phrases.3.  Otherwise the work is done in six steps:
	#   1. read <thesaurus>.numbers and map phrase symbols to thesaurus-ids
	#   2. scan phrases.2 for thesaurus phrases that already have a phind-id
	#   3. allocate new phind-ids (frequency 0) for the other thesaurus entries
	#   4. write <thesaurus>.phindid, with every thesaurus-id replaced by its
	#      phind-id and the links sorted by frequency, then by link type
	#   5. read <thesaurus>.phindid back into memory
	#   6. write phrases.3: every phrases.2 line (plus link data where the
	#      phrase is a thesaurus entry), followed by the thesaurus entries
	#      that did not occur in phrases.2
	#
	# Dies if a file cannot be opened or if the thesaurus data is
	# inconsistent (missing id or symbols, or a link to an unknown entry).

	sub renumber_thesaurus_data {
	    my ($self) = @_;

	    my $out = $self->{'outhandle'};
	    my $verbosity = $self->{'verbosity'};
	    my $thesaurus = $self->{'thesaurus'};

	    my $phinddir = $self->{'phinddir'};
	    my $infile = &util::filename_cat($phinddir, "phrases.2");
	    my $outfile = &util::filename_cat($phinddir, "phrases.3");

	    # If no thesaurus is defined, simply move the phrases file.
	    if (!$thesaurus) {
	        print $out "Translate phrases.2: no thesaurus data\n"
	            if ($verbosity);
	        &util::mv($infile, $outfile);
	        return;
	    }

	    print $out "Translate phrases.2: add thesaurus data\n"
	        if ($verbosity);

	    # 1.
	    # Read thesaurus file and store (symbols->thesaurusid) mapping
	    my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
	    my %symbolstothesid;

	    open(my $th, '<', $thesaurusfile)
	        or die "Phind: cannot read $thesaurusfile: $!";
	    while (my $line = <$th>) {
	        chomp $line;
	        my ($thesid, $symbols) = split(/:/, $line);
	        $symbolstothesid{$symbols} = $thesid;
	    }
	    close($th);

	    # 2.
	    # Read phrases file to find thesaurus entries that already
	    # have a phindid.  Store their phind-ids for later translation,
	    # and store their frequency for later sorting.
	    my %thesaurustophindid;
	    my %phindidtofrequency;
	    my $maxphindid = 0;

	    open(my $in, '<', $infile)
	        or die "Phind: cannot read $infile: $!";
	    while (my $line = <$in>) {
	        chomp $line;
	        my ($phindid, $symbols, $freq) = split(/:/, $line);

	        # remember the largest phind-id in use so that step 3 can
	        # allocate fresh numbers above it
	        $maxphindid = $phindid if ($phindid > $maxphindid);

	        # do we have a thesaurus id corresponding to this phrase?
	        if (defined($symbolstothesid{$symbols})) {
	            my $thesid = $symbolstothesid{$symbols};
	            $thesaurustophindid{$thesid} = $phindid;
	            $phindidtofrequency{$phindid} = $freq;
	        }
	    }
	    close($in);

	    undef %symbolstothesid;

	    # 3.
	    # Create phind-id numbers for remaining thesaurus entries,
	    # and note that their frequency is 0 for later sorting.
	    my $nextphindid = $maxphindid + 1;

	    open($th, '<', $thesaurusfile)
	        or die "Phind: cannot read $thesaurusfile: $!";
	    while (my $line = <$th>) {
	        chomp $line;

	        # read thesaurus-id and ensure it has a corresponding phind-id
	        my ($thesid) = split(/:/, $line);
	        if (!defined($thesaurustophindid{$thesid})) {
	            $thesaurustophindid{$thesid} = $nextphindid;
	            $phindidtofrequency{$nextphindid} = 0;
	            $nextphindid++;
	        }
	    }
	    close($th);

	    # 4.
	    # Translate thesaurus file, replacing thesaurus-id numbers with
	    # phind-id numbers.
	    my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");

	    open($th, '<', $thesaurusfile)
	        or die "Phind: cannot read $thesaurusfile: $!";
	    open(my $to, '>', $newthesaurusfile)
	        or die "Phind: cannot write $newthesaurusfile: $!";
	    while (my $line = <$th>) {
	        chomp $line;

	        # thesaurus-id, symbols and relations for this line
	        my ($thesid, $symbols, $relations) = split(/:/, $line);

	        die "Phind: malformed line in $thesaurusfile: \"$line\""
	            unless ($thesid && $symbols);
	        die "Phind: no phind-id for thesaurus-id $thesid"
	            unless ($thesaurustophindid{$thesid});
	        my $phindid = $thesaurustophindid{$thesid};

	        # convert each part of the relation string to use phind-id
	        # numbers, remembering the type (e.g. BT, NT) of each link
	        my %linkidtotype;
	        $relations = '' unless (defined($relations));

	        foreach my $linktext (split(/;/, $relations)) {
	            my ($linktype, @linkdata) = split(/,/, $linktext);

	            foreach my $linkedthesid (@linkdata) {
	                die "Phind: thesaurus-id $thesid links to unknown thesaurus-id $linkedthesid"
	                    unless (defined($thesaurustophindid{$linkedthesid}));
	                $linkidtotype{$thesaurustophindid{$linkedthesid}} = $linktype;
	            }
	        }

	        # sort the list of links, first by frequency, then by type;
	        # the final comparison on the id itself only makes the order of
	        # otherwise-equal links independent of hash ordering
	        my @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
	                           or ($linkidtotype{$a} cmp $linkidtotype{$b})
	                           or ($a <=> $b) } (keys %linkidtotype);
	        my $linkcounter = scalar(@links);

	        # create a string describing the link information: a new
	        # ";type" group is started whenever the link type changes
	        my $linktype = @links ? $linkidtotype{$links[0]} : '';
	        my $newrelation = $linktype;
	        foreach my $linkid (@links) {
	            if ($linkidtotype{$linkid} ne $linktype) {
	                $linktype = $linkidtotype{$linkid};
	                $newrelation .= ";" . $linktype;
	            }
	            $newrelation .= "," . $linkid;
	        }
	        $newrelation .= ";";

	        # output the new line
	        print {$to} "$phindid:$symbols:$linkcounter:$newrelation:\n";
	    }
	    close($th);
	    close($to)
	        or die "Phind: error writing $newthesaurusfile: $!";

	    undef %thesaurustophindid;
	    undef %phindidtofrequency;

	    # 5.
	    # Read thesaurus data (in phind-id format) into memory
	    my %thesaurusdata;

	    open($th, '<', $newthesaurusfile)
	        or die "Phind: cannot read $newthesaurusfile: $!";
	    while (my $line = <$th>) {
	        chomp $line;
	        my ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $line);
	        die "Phind: malformed line in $newthesaurusfile: \"$line\""
	            unless ($phindid && $symbols);
	        $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
	    }
	    close($th);

	    # 6.
	    # Add thesaurus data to phrases file
	    open($in, '<', $infile)
	        or die "Phind: cannot read $infile: $!";
	    open(my $phrases_out, '>', $outfile)
	        or die "Phind: cannot write $outfile: $!";

	    # Update existing phrases
	    while (my $line = <$in>) {
	        chomp $line;
	        my ($phindid, @fields) = split(/:/, $line);

	        # output the phrase data, with thesaurus information
	        print {$phrases_out} "$phindid:", join(":", @fields);

	        # add thesaurus data
	        if (defined($thesaurusdata{$phindid})) {
	            my (undef, $linkcounter, $relations)
	                = split(/:/, $thesaurusdata{$phindid});

	            print {$phrases_out} ":$linkcounter:$relations";

	            # blank the entry so it is not written a second time below
	            $thesaurusdata{$phindid} = "";
	        }
	        print {$phrases_out} "\n";
	    }
	    close($in);

	    # Add phrases that aren't already in the file
	    foreach my $phindid (sort numerically keys %thesaurusdata) {
	        next unless ($thesaurusdata{$phindid});

	        my ($symbols, $linkcounter, $relations)
	            = split(/:/, $thesaurusdata{$phindid});

	        print {$phrases_out} "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
	    }
	    close($phrases_out)
	        or die "Phind: error writing $outfile: $!";
	}
	1400
	# restore_vocabulary_data
	#
	# Read phrases.3 and restore vocabulary information.  Then write
	# this data to the MGPP input files (pword.txt and pdata.txt) and
	# (if requested) to the saved phrases file.
	#
	# The n-th line of clauses.vocab is the word for symbol "s<n>".  For
	# every line of phrases.3:
	#   pdata.txt  gets "<Document>key:text:<remaining fields>:"
	#   pword.txt  gets "<Document>word", where word is the phrase text for
	#              a single-word phrase and empty for a multi-word or
	#              'untranslated' phrase
	#   the saved phrases file (if any) gets the first and third remaining
	#              fields, a thesaurus flag and the phrase text, separated
	#              by tabs
	#
	# Dies if the vocabulary, input or MGPP output files cannot be opened.
	# Failure to open the optional saved phrases file is only reported.

	sub restore_vocabulary_data {
	    my ($self) = @_;

	    my $out = $self->{'outhandle'};
	    my $verbosity = $self->{'verbosity'};
	    print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);

	    my $phinddir = $self->{'phinddir'};
	    my $infile = &util::filename_cat($phinddir, 'phrases.3');
	    my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
	    my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
	    my $wordfile = &util::filename_cat($phinddir, 'pword.txt');

	    my $savephrases = $self->{'savephrases'};

	    # 1.
	    # Read the vocabulary file.  Symbol numbers start at 1, so slot 0
	    # of the table is left unused.
	    open(my $vocab, '<', $vocabfile)
	        or die "Cannot open $vocabfile: $!";
	    my @symbol = (undef);
	    while (my $entry = <$vocab>) {
	        chomp $entry;
	        push @symbol, $entry;
	    }
	    close($vocab);

	    # 2.
	    # Translate phrases.3 to MGPP input files
	    open(my $in, '<', $infile)
	        or die "Cannot open $infile: $!";
	    open(my $data, '>', $datafile)
	        or die "Cannot open $datafile: $!";
	    open(my $words, '>', $wordfile)
	        or die "Cannot open $wordfile: $!";

	    # Save the phrases in a separate text file
	    my $save;
	    if ($savephrases) {
	        print $out "Saving phrases in $savephrases\n" if ($verbosity);
	        if (!open($save, '>', $savephrases)) {
	            # the saved copy is optional: report the problem and carry on
	            print $out "Phind: cannot write $savephrases: $!\n";
	            undef $save;
	        }
	    }

	    while (my $line = <$in>) {
	        chomp $line;

	        # phrase number, phrase symbols, and everything else
	        my ($key, $text, @fields) = split(/:/, $line);

	        # restore the text of the phrase
	        $text =~ s/s(\d+)/$symbol[$1]/g;

	        # Only single-word phrases go into the word index.  The word is
	        # recomputed for every line so that an 'untranslated' phrase
	        # never inherits the word of the phrase before it.
	        my $word = "";
	        if ($text !~ / / && $text ne 'untranslated') {
	            $word = $text;
	        }

	        # output the phrase data
	        print {$data} "<Document>";
	        print {$data} "$key:$text:", join(":", @fields), ":\n";

	        # output the word index search data
	        print {$words} "<Document>$word\n";

	        # output the phrases to a text file; a line with seven
	        # remaining fields is one that carries thesaurus link data
	        if ($save) {
	            my $isThesaurus = (scalar(@fields) == 7) ? 1 : 0;
	            print {$save} $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
	        }
	    }
	    close($in);
	    close($words)
	        or die "Error writing $wordfile: $!";
	    close($data)
	        or die "Error writing $datafile: $!";
	    close($save) if ($save);
	}
	1492
	1493
	1494
	1495	# sort routines used to renumber phrases
	1496
	# Sort comparator: compares the two sort variables as numbers, so
	# that the smaller value is placed first.
	sub numerically {
	    return ($a <=> $b);
	}
	1498
	# Sort comparator for document entries.  An entry containing a comma
	# has its first "<digits>," run removed and what is left is used as its
	# frequency; an entry without a comma has frequency 1.  Entries with the
	# higher frequency are placed first.
	sub by_doc_frequency {
	    my ($freq_a, $freq_b) = map {
	        my $entry = $_;
	        if (index($entry, ',') >= 0) {
	            $entry =~ s/\d+,//;
	        }
	        else {
	            $entry = 1;
	        }
	        $entry;
	    } ($a, $b);

	    return ($freq_b <=> $freq_a);
	}
	1513
	1514	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: