Context Navigation

source: main/trunk/greenstone2/perllib/classify/Phind.pm@ 21876

Last change on this file since 21876 was 21876, checked in by kjdon, 14 years ago
only process into english clauses if english is the only language, not for eg with ar\|en. Don't remove all non \w characters - this removes all non alphanumeric chars. I have made up a punctuation match, some replaced with new lines, some with space.
Property svn:keywords set to `Author Date Id Revision`
File size: 40.8 KB

Rev	Line
[3472]	1	###########################################################################
	2	#
	3	# Phind.pm -- the Phind classifier
	4	#
	5	# Copyright (C) 2000 Gordon W. Paynter
	6	# Copyright (C) 2000 New Zealand Digital Library Project
	7	#
	8	#
	9	# A component of the Greenstone digital library software
	10	# from the New Zealand Digital Library Project at the
	11	# University of Waikato, New Zealand.
	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# The Phind clasifier plugin.
	30	# Type "classinfo.pl Phind" at the command line for a summary.
	31
	32	package Phind;
	33
[17209]	34	use BaseClassifier;
[3472]	35	use util;
	36	use ghtml;
	37	use unicode;
	38
[10253]	39	use strict;
	40	no strict 'refs'; # allow filehandles to be variables and viceversa
	41
[3472]	42	my @removedirs = ();
	43
	44	my %wanted_index_files = ('td'=>1,
	45	't'=>1,
	46	'ti'=>1,
	47	'tl'=>1,
	48	'tsd'=>1,
	49	'idb'=>1,
	50	'ib1'=>1,
	51	'ib2'=>1,
	52	'ib3'=>1,
	53	'i'=>1,
	54	'il'=>1,
	55	'w'=>1,
	56	'wa'=>1);
	57
	58	sub BEGIN {
[17209]	59	@Phind::ISA = ('BaseClassifier');
[3472]	60	}
	61
	62	sub END {
	63
	64	# Tidy up stray files - we do this here as there's some weird problem
	65	# preventing us from doing it in the get_classify_info() function (on
	66	# windows at least) where the close() appears to fail on txthandle and
	67	# dochandle, thus preventing us from deleting those files
	68
	69	foreach my $dir (@removedirs) {
	70	if (-d $dir && opendir (DIR, $dir)) {
	71	my @files = readdir DIR;
	72	closedir DIR;
	73
[10253]	74	foreach my $file (@files) {
[3472]	75	next if $file =~ /^\.\.?$/;
	76	my ($suffix) = $file =~ /\.([^\.]+)$/;
	77	if (!defined $suffix \|\| !defined $wanted_index_files{$suffix}) {
	78	# delete it!
	79	&util::rm (&util::filename_cat ($dir, $file));
	80	}
	81	}
	82	}
	83	}
	84	}
	85
[4759]	86	my $arguments =
	87	[ { 'name' => "text",
[4873]	88	'desc' => "{Phind.text}",
[3540]	89	'type' => "string",
[4759]	90	'deft' => "section:Title,section:text",
	91	'reqd' => "no" },
	92	{ 'name' => "title",
[4873]	93	'desc' => "{Phind.title}",
[3540]	94	'type' => "metadata",
[4759]	95	'deft' => "Title",
	96	'reqd' => "no" },
[6968]	97	{ 'name' => "buttonname",
	98	'desc' => "{BasClas.buttonname}",
[3540]	99	'type' => "string",
[4759]	100	'deft' => "Phrase",
	101	'reqd' => "no" },
	102	{ 'name' => "language",
[4873]	103	'desc' => "{Phind.language}",
[6989]	104	'type' => "string",
[4759]	105	'deft' => "en",
	106	'reqd' => "no" },
	107	{ 'name' => "savephrases",
[4873]	108	'desc' => "{Phind.savephrases}",
[3540]	109	'type' => "string",
[4759]	110	'deft' => "",
	111	'reqd' => "no" },
	112	{ 'name' => "suffixmode",
[4873]	113	'desc' => "{Phind.suffixmode}",
[3540]	114	'type' => "int",
[4759]	115	'deft' => "1",
[8362]	116	'range' => "0,1",
[4759]	117	'reqd' => "no" },
[8362]	118	{ 'name' => "min_occurs",
	119	'desc' => "{Phind.min_occurs}",
	120	'type' => "int",
	121	'deft' => "2",
	122	'range' => "1,",
	123	'reqd' => "no" },
[4759]	124	{ 'name' => "thesaurus",
[4873]	125	'desc' => "{Phind.thesaurus}",
[3540]	126	'type' => "string",
[4759]	127	'deft' => "",
	128	'reqd' => "no" },
	129	{ 'name' => "untidy",
[4873]	130	'desc' => "{Phind.untidy}",
[3540]	131	'type' => "flag",
[4759]	132	'reqd' => "no" } ];
[3540]	133
[4759]	134	my $options = { 'name' => "Phind",
[5645]	135	'desc' => "{Phind.desc}",
[6408]	136	'abstract' => "no",
	137	'inherits' => "yes",
[4759]	138	'args' => $arguments };
[3540]	139
[3472]	140
	141	# Phrase delimiter symbols - these should be abstracted out someplace
	142
	143	my $colstart = "COLLECTIONSTART";
	144	my $colend = "COLLECTIONEND";
	145	my $doclimit = "DOCUMENTLIMIT";
	146	my $senlimit = "SENTENCELIMIT";
	147	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
	148
	149
	150	# Create a new Phind browser based on collect.cfg
	151
	152	sub new {
[10218]	153	my ($class) = shift (@_);
	154	my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
	155	push(@$classifierslist, $class);
[3472]	156
[17209]	157	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	158	push(@{$hashArgOptLists->{"OptList"}},$options);
[3540]	159
[17209]	160	my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
[6968]	161
[10253]	162	if ($self->{'info_only'}) {
	163	# don't worry about any options etc
	164	return bless $self, $class;
	165	}
	166
[3472]	167	# Ensure the Phind generate scripts are in place
	168	my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
	169	$file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
	170	my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
	171	if (!(-e $file1)) {
	172	print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
	173	exit(1);
	174	}
	175
[20454]	176	# things that may have ex. in them that need to be stripped off
	177	$self->{'text'} = $self->strip_ex_from_metadata($self->{'text'});
	178	$self->{'title'} = $self->strip_ex_from_metadata($self->{'title'});
	179
[10218]	180	# Transfer value from Auto Parsing to the variable name that used in previous GreenStone.
[20454]	181
[10218]	182	$self->{"indexes"} = $self->{"text"};
[3472]	183
[10218]	184	# Further setup
	185	$self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
	186	$self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
[6982]	187	if (! defined $self->{'builddir'}) {
	188	$self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
	189	}
[10218]	190	$self->{'total'} = 0;
	191
	192	# Clean out the unused keys
	193	delete $self->{"text"};
[6982]	194
[3472]	195	return bless $self, $class;
	196	}
	197
	198
	199	# Initialise the Phind classifier
	200
	201	sub init {
	202	my $self = shift (@_);
	203
	204	# ensure we have a build directory
	205	my $builddir = $self->{'builddir'};
	206	die unless (-e "$builddir");
	207
	208	# create Phind directory
	209	my $phnumber = 1;
	210	my $phinddir = &util::filename_cat($builddir, "phind1");
	211	while (-e "$phinddir") {
	212	$phnumber++;
	213	$phinddir = &util::filename_cat($builddir, "phind$phnumber");
	214	}
	215	&util::mk_dir("$phinddir");
	216	$self->{'phinddir'} = $phinddir;
	217	$self->{'phindnumber'} = $phnumber;
	218
	219	push(@removedirs, $phinddir) unless $self->{'untidy'};
	220
	221	# open filehandles for documents and text
	222	my $clausefile = &util::filename_cat("$phinddir", "clauses");
	223	&util::rm($clausefile) if (-e $clausefile);
	224
	225	my $txthandle = 'TEXT' . $phnumber;
	226	open($txthandle, ">$clausefile") \|\| die "Cannot open $clausefile: $!";
	227	$self->{'txthandle'} = $txthandle;
	228
	229	my $docfile = &util::filename_cat("$phinddir", "docs.txt");
	230	&util::rm($docfile) if (-e $docfile);
	231
	232	my $dochandle = 'DOC' . $phnumber;
	233	open($dochandle, ">$docfile") \|\| die "Cannot open $docfile: $!";
	234	$self->{'dochandle'} = $dochandle;
	235
	236	}
	237
	238
	239	# Classify each document.
	240	#
	241	# Each document is passed here in turn. The classifier extracts the
	242	# text of each and stores it in the clauses file. Document details are
	243	# stored in the docs.txt file.
	244
	245	sub classify {
	246	my $self = shift (@_);
[18455]	247	my ($doc_obj,$edit_mode) = @_;
[3472]	248
	249	my $verbosity = $self->{'verbosity'};
	250	my $top_section = $doc_obj->get_top_section();
	251
[10218]	252	my $titlefield = $self->{'title'};
[3472]	253
	254	my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
[3536]	255	if (!defined($title)) {
	256	$title = "";
	257	print STDERR "Phind: document has no title\n";
	258	}
[3472]	259	print "process: $title\n" if ($verbosity > 2);
	260
	261	# Only consider the file if it is in the correct language
	262	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
[10218]	263	my $phrlanguage = $self->{'language'};
[3472]	264	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
[18455]	265
	266	if ($edit_mode eq "delete") {
	267	# This classifier works quite differently to the others
	268	# Probably doesn't support incremental building anyway
	269	return;
	270	}
	271
[3472]	272	# record this file
	273	$self->{'total'} ++;
[10253]	274	# what is $file ???
	275	# print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
[3472]	276
	277	# Store document details
	278	my $OID = $doc_obj->get_OID();
	279	$OID = "NULL" unless defined $OID;
	280	my $dochandle = $self->{'dochandle'};
	281	print $dochandle "<Document>\t$OID\t$title\n";
	282
	283	# Store the text occuring in this object
	284
	285	# output the document delimiter
	286	my $txthandle = $self->{'txthandle'};
	287	print $txthandle "$doclimit\n";
	288
[8362]	289	# iterate over the required indexes and store their text
[3472]	290	my $indexes = $self->{'indexes'};
	291	my $text = "";
	292	my ($part, $level, $field, $section, $data, $dataref);
	293
	294	foreach $part (split(/,/, $indexes)) {
	295
	296	# Each field has a level and a data element ((e.g. document:Title)
	297	($level, $field) = split(/:/, $part);
	298	die unless ($level && $field);
	299
	300	# Extract the text from every section
[3540]	301	# (In phind, document:text and section:text are equivalent)
[3472]	302	if ($field eq "text") {
	303	$data = "";
	304	$section = $doc_obj->get_top_section();
	305	while (defined($section)) {
	306	$data .= $doc_obj->get_text($section) . "\n";
	307	$section = $doc_obj->get_next_section($section);
	308	}
	309	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	310	}
	311
	312	# Extract a metadata field from a document
	313	# (If there is more than one element of the given type, get them all.)
	314	elsif ($level eq "document") {
	315	$dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
	316	foreach $data (@$dataref) {
	317	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	318	}
	319	}
	320
	321	# Extract metadata from every section in a document
	322	elsif ($level eq "section") {
	323	$data = "";
	324	$section = $doc_obj->get_top_section();
	325	while (defined($section)) {
	326	$dataref = $doc_obj->get_metadata($section, $field);
	327	$data .= join("\n", @$dataref) . "\n";
	328	$section = $doc_obj->get_next_section($section);
	329	}
	330	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	331	}
	332
	333	# Some sort of specification which I don't understand
	334	else {
	335	die "Unknown level ($level) in Phind index ($part)\n";
	336	}
	337
	338	}
	339
	340	# output the text
	341	$text =~ tr/\n//s;
	342	print $txthandle "$text";
	343	}
	344
	345
	346	# Construct the classifier from the information already gathered
	347	#
	348	# When get_classify_info is called, the clauses and docs.txt files have
[3540]	349	# already been constructed in the Phind directory. This function will
[3472]	350	# translate them into compressed, indexed MGPP files that can be read by
	351	# the phindcgi script. It will also register our classifier so that it
	352	# shows up in the navigation bar.
	353
	354	sub get_classify_info {
	355	my $self = shift (@_);
[6408]	356	my ($gli) = @_;
[3472]	357
	358	close $self->{'dochandle'};
	359	close $self->{'txthandle'};
	360	my $verbosity = $self->{'verbosity'};
	361	my $out = $self->{'outhandle'};
	362	my $phinddir = $self->{'phinddir'};
	363
	364	my $osextra = "";
	365	if ($ENV{'GSDLOS'} !~ /^windows$/i) {
	366	$osextra = " -d /";
	367	}
	368
[6408]	369	print STDERR "</Stage>\n" if $gli;
	370
[3472]	371	if ($verbosity) {
	372	print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
	373	print $out "*** in ", $self->{'phinddir'}, "\n";
	374	}
	375
[6408]	376	print STDERR "<Stage name='Phind'>\n" if $gli;
	377
[3540]	378	# Construct phind indexes
[3472]	379	my $suffixmode = $self->{'suffixmode'};
[8362]	380	my $min_occurs = $self->{'min_occurs'};
[3472]	381	my ($command, $status);
	382
	383	# Generate the vocabulary, symbol statistics, and numbers file
	384	# from the clauses file
	385	print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
[6408]	386	print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
[3472]	387	&extract_vocabulary($self);
[3540]	388
[3472]	389	# Use the suffix program to generate the phind/phrases file
	390	print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
[6408]	391	print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
[8362]	392	&execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
[3472]	393
	394	# check that we generated some files. It's not necessarily an error if
	395	# we didn't (execute() would have quit on error), but we can't go on.
	396	my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
	397	if (! -r $phrasesfile) {
[6408]	398	print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
[3472]	399	print $out "\nNo phrases found for Phind classifier!\n";
	400	return;
[3540]	401	}
[3472]	402
	403	# Create the phrase file and put phrase numbers in phind/phrases
	404	print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
[6408]	405	print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
[3472]	406	&renumber_phrases($self);
	407
	408	print $out "\nCreating phrase databases\n";
[6408]	409	print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
[3472]	410	my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
	411	my $mg_stem = &util::filename_cat($phinddir, "pdata");
	412
	413	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
	414	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
	415	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
	416
	417	# create the mg index of words
	418	print $out "\nCreating word-level search indexes\n";
[6408]	419	print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
[3472]	420	$mg_input = &util::filename_cat($phinddir, "pword.txt");
	421	$mg_stem = &util::filename_cat($phinddir, "pword");
	422
	423	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
	424	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
	425	&execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
	426	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
	427	&execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
	428	&execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
	429
	430	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
	431	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
	432	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
	433
	434	# create the mg document information database
	435	print $out "\nCreating document information databases\n";
[6408]	436	print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
[3472]	437	$mg_input = &util::filename_cat($phinddir, "docs.txt");
	438	$mg_stem = &util::filename_cat($phinddir, "docs");
	439
	440	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
	441	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
	442	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
	443
	444	# Return the information about the classifier that we'll later want to
	445	# use to create macros when the Phind classifier document is displayed.
	446	my %classifyinfo = ('thistype'=>'Invisible',
	447	'childtype'=>'Phind',
	448	'Title'=>$self->{'buttonname'},
	449	'parameters'=>"phindnumber=$self->{'phindnumber'}",
	450	'contains'=>[]);
	451
	452	my $collection = $self->{'collection'};
	453	my $url = "library?a=p&p=phind&c=$collection";
	454	push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
	455
	456	return \%classifyinfo;
	457	}
	458
	459
	460
	461	sub convert_gml_to_tokens {
	462
	463	my ($language_exp, $text) = @_;
	464
	465	# escape any magic words... - jrm21
	466	foreach my $delim (@delimiters) {
	467	my $replacement=lc($delim);
	468	my $num= $text=~ s/$delim/$replacement/g;
	469	if (!$num) {$num=0;}
	470	}
	471
[21876]	472	if ($language_exp =~ /^en$/) {
[3472]	473	return &convert_gml_to_tokens_EN($text);
	474	}
[3540]	475
[3472]	476	if ($language_exp =~ /zh/) {
	477	return &convert_gml_to_tokens_ZH($text);
[3540]	478	}
[3472]	479
	480	$_ = $text;
	481
	482	# 1. remove GML tags
	483
	484	# Remove everything that is in a tag
	485	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
	486	s/\s<br>\s/ LINEBREAK /isgo;
	487	s/<[^>]*>/ /sgo;
	488
	489	# Now we have the text, but it may contain HTML
	490	# elements coded as > etc. Remove these tags.
	491	s/&/&/sgo;
	492	s/</</sgo;
	493	s/>/>/sgo;
	494	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
	495	s/\s<br>\s/ LINEBREAK /isgo;
	496	s/<[^>]*>/ /sgo;
	497
	498	# replace<p> and <br> placeholders with clause break symbol (\n)
	499	s/\s+/ /gso;
	500	s/PARAGRAPHBREAK/\n/sgo;
	501	s/LINEBREAK/\n/sgo;
	502
	503
	504	# 2. Split the remaining text into space-delimited tokens
	505
[3540]	506	# Convert any HTML special characters (like ") to their UTF8 equivalent
	507	s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
[3472]	508
	509	# Split text at word boundaries
	510	s/\b/ /go;
[21876]	511
[3472]	512	# 3. Convert the remaining text to "clause format"
	513
	514	# Insert newline if the end of a sentence is detected
	515	# (delimter is: "[\.\?\!]\s")
	516	# s/\s*[\.\?\!]\s+/\n/go;
	517
	518	# remove unnecessary punctuation and replace with clause break symbol (\n)
[21876]	519	# the following very nicely removes all non alphanumeric characters. too bad if you are not using english...
	520	#s/[^\w ]/\n/go;
	521	# replace punct with new lines - is this what we want??
	522	s/\s[\?\;\:\!\,\.\"\[\]\{\}\(\)]\s/\n/go; #"
	523	# then remove other punct with space
	524	s/[\'\`\\\_]/ /go;
[3472]	525
	526	# remove extraneous whitespace
	527	s/ +/ /sgo;
	528	s/^\s+//mgo;
	529	s/\s*$/\n/mgo;
	530
	531	# remove lines that contain one word or less
	532	s/^\S*$//mgo;
	533	s/^\s*$//mgo;
	534	tr/\n//s;
	535
	536	return $_;
	537	}
	538
	539	# a chinese version
	540	sub convert_gml_to_tokens_ZH {
	541
	542	$_ = shift @_;
	543
	544	# Replace all whitespace with a simple space
	545	s/\s+/ /gs;
	546	# Remove everything that is in a tag
	547	s/\s<p>\s/ PARAGRAPHBREAK /isg;
	548	s/\s<br>\s/ LINEBREAK /isg;
	549	s/<[^>]*>/ /sg;
	550
	551	# Now we have the text, but it may contain HTML
	552	# elements coded as > etc. Remove these tags.
	553	s/</</sg;
	554	s/>/>/sg;
	555
	556	s/\s+/ /sg;
	557	s/\s<p>\s/ PARAGRAPHBREAK /isg;
	558	s/\s<br>\s/ LINEBREAK /isg;
	559	s/<[^>]*>/ /sg;
	560
	561	# remove & and other miscellaneous markup tags
	562	s/&/&/sg;
	563	s/</</sg;
	564	s/>/>/sg;
	565	s/&/&/sg;
	566
	567	# replace<p> and <br> placeholders with carriage returns
	568	s/PARAGRAPHBREAK/\n/sg;
	569	s/LINEBREAK/\n/sg;
	570
	571
	572	# print STDERR "text:$_\n";
	573	return $_;
	574	}
[3540]	575
[3472]	576	# A version of convert_gml_to_tokens that is fine-tuned to the English language.
	577
	578	sub convert_gml_to_tokens_EN {
	579	$_ = shift @_;
	580
	581	# FIRST, remove GML tags
	582
	583	# Replace all whitespace with a simple space
	584	s/\s+/ /gs;
	585
	586	# Remove everything that is in a tag
	587	s/\s<p>\s/ PARAGRAPHBREAK /isg;
	588	s/\s<br>\s/ LINEBREAK /isg;
	589	s/<[^>]*>/ /sg;
	590
	591	# Now we have the text, but it may contain HTML
	592	# elements coded as > etc. Remove these tags.
	593	s/</</sg;
	594	s/>/>/sg;
	595
	596	s/\s+/ /sg;
	597	s/\s<p>\s/ PARAGRAPHBREAK /isg;
	598	s/\s<br>\s/ LINEBREAK /isg;
	599	s/<[^>]*>/ /sg;
	600
	601	# remove & and other miscellaneous markup tags
	602	s/&/&/sg;
	603	s/</</sg;
	604	s/>/>/sg;
	605	s/&/&/sg;
	606
	607	# replace<p> and <br> placeholders with carriage returns
	608	s/PARAGRAPHBREAK/\n/sg;
	609	s/LINEBREAK/\n/sg;
	610
	611
	612	# Exceptional punctuation
	613	#
	614	# We make special cases of some punctuation
	615
	616	# remove any apostrophe that indicates omitted letters
	617	s/(\w+)\'(\w*\s)/ $1$2 /g;
	618
	619	# remove period that appears in a person's initals
	620	s/\s([A-Z])\./ $1 /g;
	621
	622	# replace hyphens in hypheanted words and names with a space
	623	s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
	624
	625	# Convert the remaining text to "clause format",
	626	# This means removing all excess punctuation and garbage text,
	627	# normalising valid punctuation to fullstops and commas,
	628	# then putting one cluse on each line.
	629
	630	# Insert newline when the end of a sentence is detected
	631	# (delimter is: "[\.\?\!]\s")
	632	s/\s*[\.\?\!]\s+/\n/g;
	633
	634	# split numbers after four digits
	635	s/(\d\d\d\d)/$1 /g;
	636
	637	# split words after 32 characters
	638
	639	# squash repeated punctuation
	640	tr/A-Za-z0-9 //cs;
	641
	642	# save email addresses
	643	# s/\w+@\w+\.[\w\.]+/EMAIL/g;
	644
	645	# normalise clause breaks (mostly punctuation symbols) to commas
	646	s/[^A-Za-z0-9 \n]+/ , /g;
	647
	648	# Remove repeated commas, and replace with newline
	649	s/\s*,[, ]+/\n/g;
	650
	651	# remove extra whitespace
	652	s/ +/ /sg;
	653	s/^\s+//mg;
	654	s/\s*$/\n/mg;
	655
	656	# remove lines that contain one word or less
	657	s/^\w*$//mg;
	658	s/^\s*$//mg;
	659	tr/\n//s;
	660
	661	return $_;
	662
	663	}
	664
	665
	666
	667	# Execute a system command
	668
	669	sub execute {
	670	my ($command, $verbosity, $outhandle) = @_;
	671	print $outhandle "Executing: $command\n" if ($verbosity > 2);
	672	$! = 0;
	673	my $status = system($command);
	674	if ($status != 0) {
	675	print STDERR "Phind - Error executing '$command': $!\n";
[3540]	676	exit($status); # this causes the build to fail...
[3472]	677	}
	678	}
	679
	680
	681	# Generate the vocabulary, symbol statistics, and numbers file from the
	682	# clauses file. This is legacy code, so is a bit messy and probably wont
	683	# run under windows.
	684
	685	sub extract_vocabulary {
	686	my ($self) = @_;
	687
	688	my $verbosity = $self->{'verbosity'};
	689	my $out = $self->{'outhandle'};
	690
	691	my $collectiondir = $self->{'collectiondir'};
	692	my $phinddir = $self->{'phinddir'};
	693
[10218]	694	my $language_exp = $self->{'language'};
[3472]	695
	696	my ($w, $l, $line, $word);
	697
	698	my ($first_delimiter, $last_delimiter,
	699	$first_stopword, $last_stopword,
	700	$first_extractword, $last_extractword,
	701	$first_contentword, $last_contentword,
	702	$phrasedelimiter);
	703
	704	my $thesaurus = $self->{'thesaurus'};
	705	my ($thesaurus_links, $thesaurus_terms,
	706	%thesaurus, $first_thesaurusword, $last_thesaurusword);
	707
	708	my %symbol;
	709	my (%freq);
	710
	711	print $out "Calculating vocabulary\n" if ($verbosity > 1);
	712
	713	# Read and store the stopwords
	714	my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
	715	my $stopword_files = ();
	716	my ($language, $language_dir, $file, $file_name);
	717	my %stopwords;
	718
	719	# Examine each directory in the stopword directory
	720	opendir(STOPDIR, $stopdir);
	721	foreach $language (readdir STOPDIR) {
	722
	723	# Ignore entries that do not match the classifier's language
	724	next unless ($language =~ /$language_exp/);
	725	$language_dir = &util::filename_cat($stopdir, $language);
	726	next unless (-d "$language_dir");
	727
	728	opendir(LANGDIR, $language_dir);
	729	foreach $file (readdir LANGDIR) {
	730
	731	# Ignore entries that are not stopword files
	732	next unless ($file =~ /sw$/);
	733	$file_name = &util::filename_cat($language_dir, $file);
	734	next unless (-f "$file_name");
	735
	736	# Read the stopwords
	737	open(STOPFILE, "<$file_name");
	738	while (<STOPFILE>) {
	739	s/^\s+//;
	740	s/\s.*//;
	741	$word = $_;
	742	$l = lc($word);
	743	$stopwords{$l} = $word;
	744	}
	745	close STOPFILE;
	746
	747	}
	748	closedir LANGDIR;
	749	}
	750	closedir STOPDIR;
	751
	752	# Read thesaurus information
	753	if ($thesaurus) {
	754
	755	# link file exists
	756	$thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
	757	die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
	758
	759	# ensure term file exists in the correct language
	760	if ($language_exp =~ /^([a-z][a-z])/) {
	761	$language = $1;
	762	} else {
	763	$language = 'en';
	764	}
	765	$thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
	766	die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
	767
	768
	769	# Read the thesaurus terms
	770	open(TH, "<$thesaurus_terms");
	771	while(<TH>) {
	772	s/^\d+ //;
	773	s/\(.*\)//;
	774	foreach $w (split(/\s+/, $_)) {
	775	$thesaurus{lc($w)} = $w;
	776	}
	777	}
	778	close TH;
	779	}
	780
	781	# Read words in the text and count occurences
	782	open(TXT, "<$phinddir/clauses");
	783
	784	my @words;
	785	while(<TXT>) {
	786	$line = $_;
	787	next unless ($line =~ /./);
	788
	789	@words = split(/\s+/, $line);
	790	foreach $w (@words) {
	791	$l = lc($w);
	792	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
	793	$freq{$w}++;
	794	}
	795	$freq{$senlimit}++;
	796	}
	797
	798	close TXT;
	799
	800	# Calculate the "best" form of each word
	801	my (%bestform, %totalfreq, %bestfreq);
	802
	803	foreach $w (sort (keys %freq)) {
	804	$l = lc($w);
	805
	806	# totalfreq is the number of times a term appears in any form
	807	$totalfreq{$l} += $freq{$w};
	808
	809	if (defined $stopwords{$l}) {
	810	$bestform{$l} = $stopwords{$l};
	811
	812	} elsif (defined $thesaurus{$l}) {
	813	$bestform{$l} = $thesaurus{$l};
	814
	815	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
	816	$bestfreq{$l} = $freq{$w};
	817	$bestform{$l} = $w;
	818	}
	819	}
	820	undef %freq;
	821	undef %bestfreq;
	822
	823
	824	# Assign symbol numbers to tokens
	825	my $nextsymbol = 1;
	826	my (@vocab);
	827
	828	# Delimiters
	829	$first_delimiter = 1;
	830
	831	foreach $word (@delimiters) {
	832
	833	# $word = lc($word); # jrm21
	834	$word = uc($word);
	835	$bestform{$word} = $word;
	836	$vocab[$nextsymbol] = $word;
	837	$symbol{$word} = $nextsymbol;
	838	$nextsymbol++;
	839	}
	840	$last_delimiter = $nextsymbol - 1;
	841	# Stopwords
	842	$first_stopword = $nextsymbol;
	843
	844	foreach my $word (sort keys %stopwords) {
	845	# don't include stopword unless it occurs in the text
	846	$word = lc($word);
	847	next unless ($totalfreq{$word});
	848	next if ($symbol{$word});
	849
	850	$vocab[$nextsymbol] = $word;
	851	$symbol{$word} = $nextsymbol;
	852	$nextsymbol++;
	853	}
	854	$last_stopword = $nextsymbol - 1;
	855	$first_contentword = $nextsymbol;
	856
	857	# Thesaurus terms
	858	if ($thesaurus) {
	859	$first_thesaurusword = $nextsymbol;
	860
	861	foreach my $word (sort keys %thesaurus) {
	862
	863	$word = lc($word);
	864	next if ($symbol{$word});
	865	$bestform{$word} = $thesaurus{$word};
	866
	867	$vocab[$nextsymbol] = $word;
	868	$symbol{$word} = $nextsymbol;
	869	$nextsymbol++;
	870
	871	}
	872	$last_thesaurusword = $nextsymbol - 1;
	873	}
	874
	875	# Other content words
	876	$first_extractword = $nextsymbol;
	877
	878	foreach my $word (sort (keys %bestform)) {
	879
	880	next if ($symbol{$word});
	881
	882	$vocab[$nextsymbol] = $word;
	883	$symbol{$word} = $nextsymbol;
	884	$nextsymbol++;
	885	}
	886	$last_extractword = $nextsymbol - 1;
	887	$last_contentword = $nextsymbol - 1;
	888
	889	# Outut the words
	890	print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
	891	open(VOC, ">$phinddir/clauses.vocab");
	892
	893	for (my $i = 1; $i < $nextsymbol; $i++) {
	894	$w = $vocab[$i];
	895
	896	print VOC "$bestform{$w}\n";
	897	$totalfreq{$w} = 0 unless ($totalfreq{$w});
	898	}
	899	close VOC;
	900
	901
	902	# Create statistics file
	903	# Output statistics about the vocablary
	904	print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
	905	&util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
	906
	907	open(STAT, ">$phinddir/clauses.stats")
	908	\|\| die "Cannot open $phinddir/clauses.stats: $!";
	909
	910	print STAT "first_delimiter $first_delimiter\n";
	911	print STAT "last_delimiter $last_delimiter\n";
	912	print STAT "first_stopword $first_stopword\n";
	913	print STAT "last_stopword $last_stopword\n";
	914	if ($thesaurus) {
	915	print STAT "first_thesaurusword $first_thesaurusword\n";
	916	print STAT "last_thesaurusword $last_thesaurusword\n";
	917	}
	918	print STAT "first_extractword $first_extractword\n";
	919	print STAT "last_extractword $last_extractword\n";
	920	print STAT "first_contentword $first_contentword\n";
	921	print STAT "last_contentword $last_contentword\n";
	922	print STAT "first_symbol $first_delimiter\n";
	923	print STAT "last_symbol $last_contentword\n";
	924	print STAT "first_word $first_stopword\n";
	925	print STAT "last_word $last_contentword\n";
	926	close STAT;
	927
	928	undef @vocab;
	929
	930
	931	# Create numbers file
	932	# Save text as symbol numbers
	933	print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
	934
	935	open(TXT, "<$phinddir/clauses");
	936	open(NUM, ">$phinddir/clauses.numbers");
	937
	938	## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
	939	## print NUM "$symbol{lc($colstart)}\n"; # jrm21
	940	$phrasedelimiter = $symbol{$senlimit};
	941	print NUM "$symbol{$colstart}\n";
	942
	943	# set up the special symbols that delimit documents and sentences
	944	while(<TXT>) {
	945
	946	# split sentence into a list of tokens
	947	$line = $_;
	948	next unless ($line =~ /./);
	949	@words = split(/\s+/, $line);
	950
	951	# output one token at a time
	952	foreach $word (@words) {
	953	# don't lower-case special delimiters - jrm21
	954	if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
	955	$word = lc($word);
	956	}
	957	print NUM "$symbol{$word}\n";
	958	}
	959
	960	# output phrase delimiter
	961	print NUM "$phrasedelimiter\n";
	962	}
	963
	964	close TXT;
	965	# print NUM "$symbol{lc($colend)}\n";# jrm21
	966	print NUM "$symbol{$colend}\n";
	967	close NUM;
	968
	969	# Save thesaurus data in one convienient file
	970	if ($thesaurus) {
	971
	972	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
	973
	974
	975	print $out "Saving thesaurus as numbers in $thesaurusfile\n"
	976	if ($verbosity > 1);
	977
	978	# Read the thesaurus terms
	979	my ($num, $text, %thes_symbols);
	980
	981	open(TH, "<$thesaurus_terms");
	982	while(<TH>) {
	983	chomp;
	984	@words = split(/\s+/, $_);
	985	$num = shift @words;
	986	$text = "";
	987
	988	# translate words into symbol numbers
	989	foreach $word (@words) {
	990	$word = lc($word);
	991	if ($symbol{$word}) {
	992	$text .= "s$symbol{$word} ";
	993	} elsif ($verbosity) {
	994	print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
	995	}
	996	}
	997	$text =~ s/ $//;
	998	$thes_symbols{$num} = $text;
	999	}
	1000	close TH;
	1001
	1002	# Read the thesaurus links and write the corresponding data
	1003	open(TH, "<$thesaurus_links");
	1004	open(THOUT, ">$thesaurusfile");
	1005
	1006	while(<TH>) {
	1007	chomp;
	1008	($num, $text) = split(/:/, $_);
	1009
	1010	if (defined($thes_symbols{$num})) {
	1011	print THOUT "$num:$thes_symbols{$num}:$text\n";
	1012	} else {
	1013	print THOUT "$num:untranslated:$text\n";
	1014	}
	1015	}
	1016	close TH;
	1017	close THOUT;
	1018	}
	1019
	1020
	1021
	1022
	1023	}
	1024
	1025
	1026	# renumber_phrases
	1027	#
	1028	# Prepare the phrases file to be input to mgpp. The biggest problem is
	1029	# reconciling the phrase identifiers used by the suffix program (which
	1030	# we'll call suffix-id numbers) with the numbers used in the thesaurus
	1031	# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
	1032	# Phind-id numbers must be sorted by frequency of occurance.
	1033	#
	1034	# Start creating a set of phind-id numbers from the sorted suffix-id
	1035	# numbers and (if required) the thesaurus-id numbers. Then add any other
	1036	# phrases occuring in the thesaurus.
	1037	#
	1038	# The last thing we have to do is restore the vocabulary information to the
	1039	# phrase file so that the phrases are stored as words, not as symbol
	1040	# numbers.
	1041
	1042	# The original phrases file looks something like this:
	1043	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
	1044	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
	1045	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
	1046	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
	1047
	1048
	1049	sub renumber_phrases {
	1050	my ($self) = @_;
	1051
	1052	renumber_suffix_data($self);
	1053	renumber_thesaurus_data($self);
	1054	restore_vocabulary_data($self);
	1055
	1056	}
	1057
	1058
	1059
	1060	# renumber_suffix_data
	1061	#
	1062	# Translate phrases file to phrases.2 using phind keys instead
	1063	# of suffix keys and sorting the expansion data.
	1064
	1065	sub renumber_suffix_data {
	1066	my ($self) = @_;
	1067
	1068	my $verbosity = $self->{'verbosity'};
	1069	my $out = $self->{'outhandle'};
	1070	print $out "Translate phrases: suffix-ids become phind-id's\n"
	1071	if ($verbosity);
	1072
	1073	my $phinddir = $self->{'phinddir'};
	1074	my $infile = &util::filename_cat($phinddir, 'phrases');
	1075	my $outfile = &util::filename_cat($phinddir, 'phrases.2');
	1076
	1077	# Read the phrase file. Calculate initial set of phind-id
	1078	# numbers and store (suffixid -> frequency) relation.
	1079
	1080	my %suffixtophind;
[10253]	1081	my @totalfrequency;
[3472]	1082	my (@fields, $suffixid);
	1083	my $nextphind = 1;
	1084
	1085	open(IN, "<$infile");
	1086	while(<IN>) {
	1087
	1088	chomp;
	1089	@fields = split(/:/, $_);
	1090
	1091	# get next suffixid and phindid
	1092	$suffixid = shift @fields;
	1093	$suffixtophind{$suffixid} = $nextphind;
	1094
	1095	# store total frequency
	1096	shift @fields;
	1097	$totalfrequency[$nextphind] = shift @fields;
	1098
	1099	$nextphind++;
	1100	}
	1101	close IN;
	1102
	1103
	1104	# Translate phrases file to phrases.2. Use phind keys (not suffix
	1105	# keys), sort expansion and document occurance data in order of
	1106	# descending frequency..
	1107	open(IN, "<$infile");
	1108	open(OUT, ">$outfile");
	1109
	1110	my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
	1111	my (@documwents, @newexp, $k, $n);
	1112	my $linenumber = 0;
	1113
	1114	while(<IN>) {
	1115
	1116	# read the line
	1117	chomp;
	1118	@fields = split(/:/, $_);
	1119
	1120	# get a phrase number for this line
	1121	$suffixid = shift @fields;
	1122	die unless (defined($suffixtophind{$suffixid}));
	1123	$phindid = $suffixtophind{$suffixid};
	1124
	1125	# get the symbols in the phrase
	1126	$text = shift @fields;
	1127
	1128	# output status information
	1129	$linenumber++;
	1130	if ($verbosity > 2) {
	1131	if ($linenumber % 1000 == 0) {
	1132	print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
[10253]	1133	}
	1134	# what are $num and $key??
	1135	#print $out "$num: $key\t($text)\n" if ($verbosity > 3);
[3472]	1136	}
	1137
	1138	# get the phrase frequency
	1139	$tf = shift @fields;
	1140
	1141	# get the number of expansions
	1142	$countexp = shift @fields;
	1143
	1144	# get the expansions, convert them into phind-id numbers, and sort them
	1145	$expansions = shift @fields;
	1146	@newexp = ();
	1147	foreach $k (split(/,/, $expansions)) {
	1148	die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
	1149	$n = $suffixtophind{$k};
	1150	push @newexp, $n;
	1151	}
	1152	@newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
	1153
	1154	# get the number of documents
	1155	$countdocs = shift @fields;
	1156
	1157	# get the documents and sort them
	1158	$documents = shift @fields;
	1159	$documents =~ s/d//g;
[10253]	1160	my @documents = split(/;/, $documents);
[3472]	1161	@documents = sort by_doc_frequency @documents;
	1162
	1163	# output the phrase data
	1164	print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
	1165	print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
	1166
	1167	}
	1168
	1169	close IN;
	1170	close OUT;
	1171	}
	1172
	1173
	1174	# renumber_thesaurus_data
	1175	#
	1176	# Translate phrases.2 to phrases.3, adding thesaurus data if available.
	1177
	1178	sub renumber_thesaurus_data {
	1179	my ($self) = @_;
	1180
	1181	my $out = $self->{'outhandle'};
	1182	my $verbosity = $self->{'verbosity'};
	1183	my $thesaurus = $self->{'thesaurus'};
	1184
	1185	my $phinddir = $self->{'phinddir'};
	1186	my $infile = &util::filename_cat($phinddir, "phrases.2");
	1187	my $outfile = &util::filename_cat($phinddir, "phrases.3");
	1188
	1189
	1190	# If no thesaurus is defined, simply move the phrases file.
	1191	if (!$thesaurus) {
	1192	print $out "Translate phrases.2: no thesaurus data\n"
	1193	if ($verbosity);
	1194	&util::mv($infile, $outfile);
	1195	return;
	1196	}
	1197
	1198	print $out "Translate phrases.2: add thesaurus data\n"
	1199	if ($verbosity);
	1200
	1201	# 1.
	1202	# Read thesaurus file and store (symbols->thesaurusid) mapping
	1203	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
	1204	my %symbolstothesid;
	1205	my (@fields, $thesid, $symbols);
	1206
	1207	open(TH, "<$thesaurusfile");
	1208
	1209	while (<TH>) {
	1210
	1211	chomp;
	1212	@fields = split(/:/, $_);
	1213
	1214	# get id and text
	1215	$thesid = shift @fields;
	1216	$symbols = shift @fields;
	1217	$symbolstothesid{$symbols} = $thesid;
	1218	}
	1219	close TH;
	1220
	1221	# 2.
	1222	# Read phrases file to find thesaurus entries that already
	1223	# have a phindid. Store their phind-ids for later translation,
	1224	# and store their frequency for later sorting.
	1225	my %thesaurustophindid;
	1226	my %phindidtofrequency;
	1227	my ($phindid, $freq);
	1228
	1229	open(IN, "<$infile");
	1230
	1231	while(<IN>) {
	1232
	1233	chomp;
	1234	@fields = split(/:/, $_);
	1235
	1236	# phindid and symbols for this line
	1237	$phindid = shift @fields;
	1238	$symbols = shift @fields;
	1239	$freq = shift @fields;
	1240
	1241	# do we have a thesaurus id corresponding to this phrase?
	1242	if (defined($symbolstothesid{$symbols})) {
	1243	$thesid = $symbolstothesid{$symbols};
	1244	$thesaurustophindid{$thesid} = $phindid;
	1245	$phindidtofrequency{$phindid} = $freq;
	1246	}
	1247	}
	1248	close IN;
	1249
	1250	undef %symbolstothesid;
	1251
	1252	# 3.
	1253	# Create phind-id numbers for remaining thesaurus entries,
	1254	# and note that their frequency is 0 for later sorting.
	1255	my $nextphindid = $phindid + 1;
	1256
	1257	open(TH, "<$thesaurusfile");
	1258	while(<TH>) {
	1259
	1260	chomp;
	1261	@fields = split(/:/, $_);
	1262
	1263	# read thesaurus-id and ensure it has a corresponding phind-id
	1264	$thesid = shift @fields;
	1265	if (!defined($thesaurustophindid{$thesid})) {
	1266	$thesaurustophindid{$thesid} = $nextphindid;
	1267	$phindidtofrequency{$nextphindid} = 0;
	1268	$nextphindid++;
	1269	}
	1270	}
	1271	close TH;
	1272
	1273	# 4.
	1274	# Translate thesaurus file, replacing thesaurus-id numbers with
	1275	# phind-id numbers.
	1276	my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
	1277	my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
	1278	my (@links, $linkid, %linkidtotype, $newrelation);
	1279
	1280	open(TH, "<$thesaurusfile");
	1281	open(TO, ">$newthesaurusfile");
	1282	while(<TH>) {
	1283
	1284	chomp;
	1285	@fields = split(/:/, $_);
	1286
	1287	# phindid and symbols for this line
	1288	($thesid, $symbols, $relations) = @fields;
	1289
	1290	die unless ($thesid && $symbols);
	1291	die unless $thesaurustophindid{$thesid};
	1292	$phindid = $thesaurustophindid{$thesid};
	1293
	1294	# convert each part of the relation string to use phind-id numbers
	1295	# at the same time, we want to sort the list by frequency.
	1296	undef %linkidtotype;
	1297
	1298	foreach $linktext (split(/;/, $relations)) {
	1299	@linkdata = split(/,/, $linktext);
	1300
	1301	# remember the linktype (e.g. BT, NT)
	1302	$linktype = shift @linkdata;
	1303
	1304	# store the type of each link
	1305	foreach $thesid (@linkdata) {
	1306	die unless (defined($thesaurustophindid{$thesid}));
	1307	$linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
	1308	}
	1309	}
	1310
	1311	# sort the list of links, first by frequency, then by type.
	1312	@links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
	1313	or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
	1314	$linkcounter = (scalar @links);
	1315
	1316	# create a string describing the link information
	1317	$linktype = $linkidtotype{$links[0]};
	1318	$newrelation = $linktype;
	1319	foreach $linkid (@links) {
	1320	if ($linkidtotype{$linkid} ne $linktype) {
	1321	$linktype = $linkidtotype{$linkid};
	1322	$newrelation .= ";" . $linktype;
	1323	}
	1324	$newrelation .= "," . $linkid;
	1325	}
	1326	$newrelation .= ";";
	1327
	1328
	1329	# output the new line
	1330	print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
	1331	}
	1332	close TH;
	1333	close TO;
	1334
	1335	undef %thesaurustophindid;
	1336	undef %linkidtotype;
	1337	undef %phindidtofrequency;
	1338
	1339	# 5.
	1340	# Read thesaurus data (in phind-id format) into memory
	1341	my %thesaurusdata;
	1342
	1343	open(TH, "<$newthesaurusfile");
	1344	while(<TH>) {
	1345	chomp;
	1346	($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
	1347	die unless ($phindid && $symbols);
	1348	$thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
	1349	}
	1350	close TH;
	1351
	1352	# 6.
	1353	# Add thesaurus data to phrases file
	1354	my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
	1355	my (@documwents, @newexp, $k, $n);
	1356	my $linenumber = 0;
	1357
	1358	open(IN, "<$infile");
	1359	open(OUT, ">$outfile");
	1360
	1361	# Update existing phrases
	1362	while(<IN>) {
	1363
	1364	chomp;
	1365	@fields = split(/:/, $_);
	1366
	1367	# get data for this line
	1368	$phindid = shift @fields;
	1369
	1370	# output the phrase data, with thesaurus information
	1371	print OUT "$phindid:", join(":", @fields);
	1372
	1373	# add thesaurus data
	1374	if (defined($thesaurusdata{$phindid})) {
	1375	@fields = split(/:/, $thesaurusdata{$phindid});
	1376	shift @fields;
	1377	$linkcounter = shift @fields;
	1378	$relations = shift @fields;
	1379
	1380	print OUT ":$linkcounter:$relations";
	1381	$thesaurusdata{$phindid} = "";
	1382	}
	1383	print OUT "\n";
	1384	}
	1385	close IN;
	1386
	1387	# Add phrases that aren't already in the file
	1388	foreach $phindid (sort numerically keys %thesaurusdata) {
	1389	next unless ($thesaurusdata{$phindid});
	1390
	1391	@fields = split(/:/, $thesaurusdata{$phindid});
	1392	$symbols = shift @fields;
	1393	$linkcounter = shift @fields;
	1394	$relations = shift @fields;
	1395
	1396	print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
	1397	}
	1398	close OUT;
	1399
	1400	}
	1401
	1402	# restore_vocabulary_data
	1403	#
	1404	# Read phrases.3 and restore vocabulary information. Then write
[8362]	1405	# this data to the MGPP input files (pword.txt and pdata.txt) and
[3472]	1406	# (if requested) to the saved phrases file.
	1407
	1408	sub restore_vocabulary_data {
	1409	my ($self) = @_;
	1410
	1411	my $out = $self->{'outhandle'};
	1412	my $verbosity = $self->{'verbosity'};
	1413	print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
	1414
	1415	my $phinddir = $self->{'phinddir'};
	1416	my $infile = &util::filename_cat($phinddir, 'phrases.3');
	1417	my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
	1418	my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
	1419	my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
	1420
	1421	my $savephrases = $self->{'savephrases'};
	1422
	1423	# 1.
	1424	# Read the vocabulary file
	1425	open(V, "<$vocabfile")
	1426	\|\| die "Cannot open $vocabfile: $!";
	1427	my @symbol;
	1428	my $i = 1;
	1429	while(<V>) {
	1430	chomp;
	1431	$symbol[$i++] = $_;
	1432	}
	1433	close V;
	1434
	1435	# 2.
	1436	# Translate phrases.3 to MGPP input files
[10253]	1437	my ($key, $text, $word, $isThesaurus, $line);
[3472]	1438	my @fields;
	1439	my $linenumber = 0;
	1440
	1441	open(IN, "<$infile");
	1442	open(DATA, ">$datafile");
	1443	open(WORD, ">$wordfile");
	1444
	1445	# Save the phrases in a separate text file
	1446	if ($savephrases) {
	1447	print $out "Saving phrases in $savephrases\n" if ($verbosity);
	1448	open(SAVE, ">$savephrases");
	1449	}
	1450
	1451	while(<IN>) {
	1452
	1453	# read the line
	1454	chomp;
	1455	$line = $_;
	1456	@fields = split(/:/, $line);
	1457
	1458	# get a phrase number for this line
	1459	$key = shift @fields;
	1460
	1461	# restore the text of the phrase
	1462	$text = shift @fields;
	1463	$text =~ s/s(\d+)/$symbol[$1]/g;
	1464	if ($text =~ / /) {
	1465	$word = "";
	1466	} elsif ($text ne 'untranslated') {
	1467	$word = $text;
	1468	}
	1469
	1470	# output the phrase data
	1471	print DATA "<Document>";
	1472	print DATA "$key:$text:", join(":", @fields), ":\n";
	1473
	1474	# output the word index search data
	1475	print WORD "<Document>$word\n";
	1476
	1477	# output the phrases to a text file
	1478	if ($savephrases) {
	1479	if ((scalar @fields) == 7) {
	1480	$isThesaurus = 1;
	1481	} else {
	1482	$isThesaurus = 0;
	1483	}
	1484	print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
	1485	}
	1486	}
	1487	close IN;
	1488	close WORD;
	1489	close DATA;
	1490	close SAVE if ($savephrases);
	1491
	1492	}
	1493
	1494
	1495
	1496	# sort routines used to renumber phrases
	1497
	1498	sub numerically { $a <=> $b }
	1499
	1500	sub by_doc_frequency {
	1501	my $fa = 1;
	1502	if ($a =~ /,/) {
	1503	$fa = $a;
	1504	$fa =~ s/\d+,//;
	1505	}
	1506	my $fb = 1;
	1507	if ($b =~ /,/) {
	1508	$fb = $b;
	1509	$fb =~ s/\d+,//;
	1510	}
	1511
	1512	return ($fb <=> $fa);
	1513	}
	1514
	1515	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: