Context Navigation

Phind.pm@ 28706

Last change on this file since 28706 was 28560, checked in by ak19, 11 years ago
New subroutine util::set_gnomelib_env that sets the environment for gnomelib needed for running hashfile, suffix and wget which are dependent on the libiconv dll in ext/gnome-lib(-minimal). It's particularly the Mac Lions that need libiconv.2.dylib. 2. Updated the call to hashfile in doc.pm, the call to suffix in Phind.pm and the calls to wget in several perl scripts and modules to call util::set_gnomelib_env, though this will only set the environment once for each subshell.
Property svn:keywords set to `Author Date Id Revision`
File size: 41.6 KB

Rev	Line
[3472]	1	###########################################################################
	2	#
	3	# Phind.pm -- the Phind classifier
	4	#
	5	# Copyright (C) 2000 Gordon W. Paynter
	6	# Copyright (C) 2000 New Zealand Digital Library Project
	7	#
	8	#
	9	# A component of the Greenstone digital library software
	10	# from the New Zealand Digital Library Project at the
	11	# University of Waikato, New Zealand.
	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# The Phind classifier plugin.
	30	# Type "classinfo.pl Phind" at the command line for a summary.
	31
	32	package Phind;
	33
[17209]	34	use BaseClassifier;
[27904]	35	use FileUtils;
[3472]	36	use util;
	37	use ghtml;
	38	use unicode;
	39
[10253]	40	use strict;
	41	no strict 'refs'; # allow filehandles to be variables and viceversa
	42
# Phind directories queued for tidying; the END block walks this list.
my @removedirs = ();

# File suffixes that the END-block tidy-up must leave in place. Any file in
# a queued Phind directory whose suffix is not listed here is deleted.
my %wanted_index_files = map { $_ => 1 }
    qw(td t ti tl tsd idb ib1 ib2 ib3 i il w wa);
	58
	59	sub BEGIN {
[17209]	60	@Phind::ISA = ('BaseClassifier');
[3472]	61	}
	62
	63	sub END {
	64
	65	# Tidy up stray files - we do this here as there's some weird problem
	66	# preventing us from doing it in the get_classify_info() function (on
	67	# windows at least) where the close() appears to fail on txthandle and
	68	# dochandle, thus preventing us from deleting those files
	69
	70	foreach my $dir (@removedirs) {
	71	if (-d $dir && opendir (DIR, $dir)) {
	72	my @files = readdir DIR;
	73	closedir DIR;
	74
[10253]	75	foreach my $file (@files) {
[3472]	76	next if $file =~ /^\.\.?$/;
	77	my ($suffix) = $file =~ /\.([^\.]+)$/;
	78	if (!defined $suffix \|\| !defined $wanted_index_files{$suffix}) {
	79	# delete it!
[27904]	80	&FileUtils::removeFiles (&FileUtils::filenameConcatenate ($dir, $file));
[3472]	81	}
	82	}
	83	}
	84	}
	85	}
	86
# Argument specifications for this classifier. Each entry gives the option
# name, a description key, the value type, an optional default ('deft'),
# an optional 'range', and whether the option is required.
my $arguments = [
    {   'name' => 'text',
        'desc' => '{Phind.text}',
        'type' => 'string',
        'deft' => 'section:Title,section:text',
        'reqd' => 'no',
    },
    {   'name' => 'title',
        'desc' => '{Phind.title}',
        'type' => 'metadata',
        'deft' => 'Title',
        'reqd' => 'no',
    },
    {   'name' => 'buttonname',
        'desc' => '{BasClas.buttonname}',
        'type' => 'string',
        'deft' => 'Phrase',
        'reqd' => 'no',
    },
    {   'name' => 'language',
        'desc' => '{Phind.language}',
        'type' => 'string',
        'deft' => 'en',
        'reqd' => 'no',
    },
    {   'name' => 'savephrases',
        'desc' => '{Phind.savephrases}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'no',
    },
    {   'name'  => 'suffixmode',
        'desc'  => '{Phind.suffixmode}',
        'type'  => 'int',
        'deft'  => '1',
        'range' => '0,1',
        'reqd'  => 'no',
    },
    {   'name'  => 'min_occurs',
        'desc'  => '{Phind.min_occurs}',
        'type'  => 'int',
        'deft'  => '2',
        'range' => '1,',
        'reqd'  => 'no',
    },
    {   'name' => 'thesaurus',
        'desc' => '{Phind.thesaurus}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'no',
    },
    {   'name' => 'untidy',
        'desc' => '{Phind.untidy}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Classifier description record; new() pushes it onto the "OptList".
my $options = {
    'name'     => 'Phind',
    'desc'     => '{Phind.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
[3540]	140
[3472]	141
	142	# Phrase delimiter symbols - these should be abstracted out someplace
	143
	144	my $colstart = "COLLECTIONSTART";
	145	my $colend = "COLLECTIONEND";
	146	my $doclimit = "DOCUMENTLIMIT";
	147	my $senlimit = "SENTENCELIMIT";
	148	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
	149
	150
	151	# Create a new Phind browser based on collect.cfg
	152
	153	sub new {
[10218]	154	my ($class) = shift (@_);
	155	my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
	156	push(@$classifierslist, $class);
[3472]	157
[17209]	158	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	159	push(@{$hashArgOptLists->{"OptList"}},$options);
[3540]	160
[17209]	161	my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
[6968]	162
[10253]	163	if ($self->{'info_only'}) {
	164	# don't worry about any options etc
	165	return bless $self, $class;
	166	}
	167
[3472]	168	# Ensure the Phind generate scripts are in place
[28560]	169
	170	# the suffix binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
	171	&util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
	172
[27904]	173	my $file1 = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
[3472]	174	$file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
[27904]	175	my $src = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "src", "phind", "generate");
[3472]	176	if (!(-e $file1)) {
	177	print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
	178	exit(1);
	179	}
	180
[20454]	181	# things that may have ex. in them that need to be stripped off
	182	$self->{'text'} = $self->strip_ex_from_metadata($self->{'text'});
	183	$self->{'title'} = $self->strip_ex_from_metadata($self->{'title'});
	184
[10218]	185	# Transfer value from Auto Parsing to the variable name that used in previous GreenStone.
[20454]	186
[10218]	187	$self->{"indexes"} = $self->{"text"};
[3472]	188
[10218]	189	# Further setup
	190	$self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
	191	$self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
[6982]	192	if (! defined $self->{'builddir'}) {
[27904]	193	$self->{'builddir'} = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "building");
[6982]	194	}
[10218]	195	$self->{'total'} = 0;
	196
[22382]	197	# we set phind to be rtl if language is arabic
	198	if ($self->{'language'} eq "ar") {
	199	$self->{'textorientation'} = "rtl";
	200	}
[10218]	201	# Clean out the unused keys
	202	delete $self->{"text"};
[6982]	203
[3472]	204	return bless $self, $class;
	205	}
	206
	207
	208	# Initialise the Phind classifier
	209
	210	sub init {
	211	my $self = shift (@_);
	212
	213	# ensure we have a build directory
	214	my $builddir = $self->{'builddir'};
	215	die unless (-e "$builddir");
	216
	217	# create Phind directory
	218	my $phnumber = 1;
[27904]	219	my $phinddir = &FileUtils::filenameConcatenate($builddir, "phind1");
[3472]	220	while (-e "$phinddir") {
	221	$phnumber++;
[27904]	222	$phinddir = &FileUtils::filenameConcatenate($builddir, "phind$phnumber");
[3472]	223	}
[27905]	224	&FileUtils::makeDirectory("$phinddir");
[3472]	225	$self->{'phinddir'} = $phinddir;
	226	$self->{'phindnumber'} = $phnumber;
	227
	228	push(@removedirs, $phinddir) unless $self->{'untidy'};
	229
	230	# open filehandles for documents and text
[27904]	231	my $clausefile = &FileUtils::filenameConcatenate("$phinddir", "clauses");
	232	&FileUtils::removeFiles($clausefile) if (-e $clausefile);
[3472]	233
	234	my $txthandle = 'TEXT' . $phnumber;
	235	open($txthandle, ">$clausefile") \|\| die "Cannot open $clausefile: $!";
	236	$self->{'txthandle'} = $txthandle;
	237
[27904]	238	my $docfile = &FileUtils::filenameConcatenate("$phinddir", "docs.txt");
	239	&FileUtils::removeFiles($docfile) if (-e $docfile);
[3472]	240
	241	my $dochandle = 'DOC' . $phnumber;
	242	open($dochandle, ">$docfile") \|\| die "Cannot open $docfile: $!";
	243	$self->{'dochandle'} = $dochandle;
	244
	245	}
	246
	247
	248	# Classify each document.
	249	#
	250	# Each document is passed here in turn. The classifier extracts the
	251	# text of each and stores it in the clauses file. Document details are
	252	# stored in the docs.txt file.
	253
	254	sub classify {
	255	my $self = shift (@_);
[23116]	256	my ($doc_obj) = @_;
[3472]	257
	258	my $verbosity = $self->{'verbosity'};
	259	my $top_section = $doc_obj->get_top_section();
	260
[10218]	261	my $titlefield = $self->{'title'};
[3472]	262
	263	my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
[3536]	264	if (!defined($title)) {
	265	$title = "";
	266	print STDERR "Phind: document has no title\n";
	267	}
[3472]	268	print "process: $title\n" if ($verbosity > 2);
	269
	270	# Only consider the file if it is in the correct language
	271	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
[10218]	272	my $phrlanguage = $self->{'language'};
[3472]	273	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
[18455]	274
[3472]	275	# record this file
	276	$self->{'total'} ++;
[10253]	277	# what is $file ???
	278	# print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
[3472]	279
	280	# Store document details
	281	my $OID = $doc_obj->get_OID();
	282	$OID = "NULL" unless defined $OID;
	283	my $dochandle = $self->{'dochandle'};
	284	print $dochandle "<Document>\t$OID\t$title\n";
	285
	286	# Store the text occuring in this object
	287
	288	# output the document delimiter
	289	my $txthandle = $self->{'txthandle'};
	290	print $txthandle "$doclimit\n";
	291
[8362]	292	# iterate over the required indexes and store their text
[3472]	293	my $indexes = $self->{'indexes'};
	294	my $text = "";
	295	my ($part, $level, $field, $section, $data, $dataref);
	296
	297	foreach $part (split(/,/, $indexes)) {
	298
	299	# Each field has a level and a data element ((e.g. document:Title)
	300	($level, $field) = split(/:/, $part);
	301	die unless ($level && $field);
	302
	303	# Extract the text from every section
[3540]	304	# (In phind, document:text and section:text are equivalent)
[3472]	305	if ($field eq "text") {
	306	$data = "";
	307	$section = $doc_obj->get_top_section();
	308	while (defined($section)) {
	309	$data .= $doc_obj->get_text($section) . "\n";
	310	$section = $doc_obj->get_next_section($section);
	311	}
	312	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	313	}
	314
	315	# Extract a metadata field from a document
	316	# (If there is more than one element of the given type, get them all.)
	317	elsif ($level eq "document") {
	318	$dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
	319	foreach $data (@$dataref) {
	320	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	321	}
	322	}
	323
	324	# Extract metadata from every section in a document
	325	elsif ($level eq "section") {
	326	$data = "";
	327	$section = $doc_obj->get_top_section();
	328	while (defined($section)) {
	329	$dataref = $doc_obj->get_metadata($section, $field);
	330	$data .= join("\n", @$dataref) . "\n";
	331	$section = $doc_obj->get_next_section($section);
	332	}
	333	$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	334	}
	335
	336	# Some sort of specification which I don't understand
	337	else {
	338	die "Unknown level ($level) in Phind index ($part)\n";
	339	}
	340
	341	}
	342
	343	# output the text
	344	$text =~ tr/\n//s;
	345	print $txthandle "$text";
	346	}
	347
	348
	349	# Construct the classifier from the information already gathered
	350	#
	351	# When get_classify_info is called, the clauses and docs.txt files have
[3540]	352	# already been constructed in the Phind directory. This function will
[3472]	353	# translate them into compressed, indexed MGPP files that can be read by
	354	# the phindcgi script. It will also register our classifier so that it
	355	# shows up in the navigation bar.
	356
	357	sub get_classify_info {
	358	my $self = shift (@_);
[6408]	359	my ($gli) = @_;
[3472]	360
	361	close $self->{'dochandle'};
	362	close $self->{'txthandle'};
	363	my $verbosity = $self->{'verbosity'};
	364	my $out = $self->{'outhandle'};
	365	my $phinddir = $self->{'phinddir'};
	366
	367	my $osextra = "";
	368	if ($ENV{'GSDLOS'} !~ /^windows$/i) {
	369	$osextra = " -d /";
	370	}
	371
[6408]	372	print STDERR "</Stage>\n" if $gli;
	373
[3472]	374	if ($verbosity) {
	375	print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
	376	print $out "*** in ", $self->{'phinddir'}, "\n";
	377	}
	378
[6408]	379	print STDERR "<Stage name='Phind'>\n" if $gli;
	380
[3540]	381	# Construct phind indexes
[3472]	382	my $suffixmode = $self->{'suffixmode'};
[8362]	383	my $min_occurs = $self->{'min_occurs'};
[3472]	384	my ($command, $status);
	385
	386	# Generate the vocabulary, symbol statistics, and numbers file
	387	# from the clauses file
	388	print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
[6408]	389	print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
[3472]	390	&extract_vocabulary($self);
[3540]	391
[3472]	392	# Use the suffix program to generate the phind/phrases file
	393	print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
[6408]	394	print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
[8362]	395	&execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
[3472]	396
	397	# check that we generated some files. It's not necessarily an error if
	398	# we didn't (execute() would have quit on error), but we can't go on.
[27904]	399	my $phrasesfile=&FileUtils::filenameConcatenate($self->{'phinddir'}, 'phrases');
[3472]	400	if (! -r $phrasesfile) {
[6408]	401	print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
[3472]	402	print $out "\nNo phrases found for Phind classifier!\n";
	403	return;
[3540]	404	}
[3472]	405
	406	# Create the phrase file and put phrase numbers in phind/phrases
	407	print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
[6408]	408	print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
[3472]	409	&renumber_phrases($self);
	410
	411	print $out "\nCreating phrase databases\n";
[6408]	412	print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
[27904]	413	my $mg_input = &FileUtils::filenameConcatenate($phinddir, "pdata.txt");
	414	my $mg_stem = &FileUtils::filenameConcatenate($phinddir, "pdata");
[3472]	415
	416	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
	417	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
	418	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
	419
	420	# create the mg index of words
	421	print $out "\nCreating word-level search indexes\n";
[6408]	422	print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
[27904]	423	$mg_input = &FileUtils::filenameConcatenate($phinddir, "pword.txt");
	424	$mg_stem = &FileUtils::filenameConcatenate($phinddir, "pword");
[3472]	425
	426	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
	427	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
	428	&execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
	429	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
	430	&execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
	431	&execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
	432
	433	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
	434	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
	435	&execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
	436
	437	# create the mg document information database
	438	print $out "\nCreating document information databases\n";
[6408]	439	print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
[27904]	440	$mg_input = &FileUtils::filenameConcatenate($phinddir, "docs.txt");
	441	$mg_stem = &FileUtils::filenameConcatenate($phinddir, "docs");
[3472]	442
	443	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
	444	&execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
	445	&execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
	446
[22382]	447	my $parameters = "phindnumber=$self->{'phindnumber'}";
	448	if (defined ($self->{'textorientation'})) {
	449	$parameters .= ";textorientation=$self->{'textorientation'}";
	450	}
[3472]	451	# Return the information about the classifier that we'll later want to
	452	# use to create macros when the Phind classifier document is displayed.
	453	my %classifyinfo = ('thistype'=>'Invisible',
	454	'childtype'=>'Phind',
	455	'Title'=>$self->{'buttonname'},
[22382]	456	'parameters'=>$parameters,
[3472]	457	'contains'=>[]);
	458
	459	my $collection = $self->{'collection'};
	460	my $url = "library?a=p&p=phind&c=$collection";
	461	push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
	462
	463	return \%classifyinfo;
	464	}
	465
	466
	467
	468	sub convert_gml_to_tokens {
	469
	470	my ($language_exp, $text) = @_;
	471
	472	# escape any magic words... - jrm21
	473	foreach my $delim (@delimiters) {
	474	my $replacement=lc($delim);
	475	my $num= $text=~ s/$delim/$replacement/g;
	476	if (!$num) {$num=0;}
	477	}
	478
[21876]	479	if ($language_exp =~ /^en$/) {
[3472]	480	return &convert_gml_to_tokens_EN($text);
	481	}
[3540]	482
[3472]	483	if ($language_exp =~ /zh/) {
	484	return &convert_gml_to_tokens_ZH($text);
[3540]	485	}
[3472]	486
	487	$_ = $text;
	488
	489	# 1. remove GML tags
	490
	491	# Remove everything that is in a tag
	492	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
	493	s/\s<br>\s/ LINEBREAK /isgo;
	494	s/<[^>]*>/ /sgo;
	495
	496	# Now we have the text, but it may contain HTML
	497	# elements coded as > etc. Remove these tags.
	498	s/&/&/sgo;
	499	s/</</sgo;
	500	s/>/>/sgo;
	501	s/\s<p>\s/ PARAGRAPHBREAK /isgo;
	502	s/\s<br>\s/ LINEBREAK /isgo;
	503	s/<[^>]*>/ /sgo;
	504
	505	# replace<p> and <br> placeholders with clause break symbol (\n)
	506	s/\s+/ /gso;
	507	s/PARAGRAPHBREAK/\n/sgo;
	508	s/LINEBREAK/\n/sgo;
	509
	510
	511	# 2. Split the remaining text into space-delimited tokens
	512
[3540]	513	# Convert any HTML special characters (like ") to their UTF8 equivalent
	514	s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
[3472]	515
	516	# Split text at word boundaries
	517	s/\b/ /go;
[21876]	518
[3472]	519	# 3. Convert the remaining text to "clause format"
	520
	521	# Insert newline if the end of a sentence is detected
	522	# (delimter is: "[\.\?\!]\s")
	523	# s/\s*[\.\?\!]\s+/\n/go;
	524
	525	# remove unnecessary punctuation and replace with clause break symbol (\n)
[21876]	526	# the following very nicely removes all non alphanumeric characters. too bad if you are not using english...
	527	#s/[^\w ]/\n/go;
	528	# replace punct with new lines - is this what we want??
	529	s/\s[\?\;\:\!\,\.\"\[\]\{\}]\s/\n/go; #"
	530	# then remove other punct with space
	531	s/[\'\`\\\_]/ /go;
[3472]	532
	533	# remove extraneous whitespace
	534	s/ +/ /sgo;
	535	s/^\s+//mgo;
	536	s/\s*$/\n/mgo;
	537
	538	# remove lines that contain one word or less
	539	s/^\S*$//mgo;
	540	s/^\s*$//mgo;
	541	tr/\n//s;
	542
	543	return $_;
	544	}
	545
	546	# a chinese version
	547	sub convert_gml_to_tokens_ZH {
	548
	549	$_ = shift @_;
	550
	551	# Replace all whitespace with a simple space
	552	s/\s+/ /gs;
	553	# Remove everything that is in a tag
	554	s/\s<p>\s/ PARAGRAPHBREAK /isg;
	555	s/\s<br>\s/ LINEBREAK /isg;
	556	s/<[^>]*>/ /sg;
	557
	558	# Now we have the text, but it may contain HTML
	559	# elements coded as > etc. Remove these tags.
	560	s/</</sg;
	561	s/>/>/sg;
	562
	563	s/\s+/ /sg;
	564	s/\s<p>\s/ PARAGRAPHBREAK /isg;
	565	s/\s<br>\s/ LINEBREAK /isg;
	566	s/<[^>]*>/ /sg;
	567
	568	# remove & and other miscellaneous markup tags
	569	s/&/&/sg;
	570	s/</</sg;
	571	s/>/>/sg;
	572	s/&/&/sg;
	573
	574	# replace<p> and <br> placeholders with carriage returns
	575	s/PARAGRAPHBREAK/\n/sg;
	576	s/LINEBREAK/\n/sg;
	577
	578
	579	# print STDERR "text:$_\n";
	580	return $_;
	581	}
[3540]	582
[3472]	583	# A version of convert_gml_to_tokens that is fine-tuned to the English language.
	584
	585	sub convert_gml_to_tokens_EN {
	586	$_ = shift @_;
	587
	588	# FIRST, remove GML tags
	589
	590	# Replace all whitespace with a simple space
	591	s/\s+/ /gs;
	592
	593	# Remove everything that is in a tag
	594	s/\s<p>\s/ PARAGRAPHBREAK /isg;
	595	s/\s<br>\s/ LINEBREAK /isg;
	596	s/<[^>]*>/ /sg;
	597
	598	# Now we have the text, but it may contain HTML
	599	# elements coded as > etc. Remove these tags.
	600	s/</</sg;
	601	s/>/>/sg;
	602
	603	s/\s+/ /sg;
	604	s/\s<p>\s/ PARAGRAPHBREAK /isg;
	605	s/\s<br>\s/ LINEBREAK /isg;
	606	s/<[^>]*>/ /sg;
	607
	608	# remove & and other miscellaneous markup tags
	609	s/&/&/sg;
	610	s/</</sg;
	611	s/>/>/sg;
	612	s/&/&/sg;
	613
	614	# replace<p> and <br> placeholders with carriage returns
	615	s/PARAGRAPHBREAK/\n/sg;
	616	s/LINEBREAK/\n/sg;
	617
	618
	619	# Exceptional punctuation
	620	#
	621	# We make special cases of some punctuation
	622
	623	# remove any apostrophe that indicates omitted letters
	624	s/(\w+)\'(\w*\s)/ $1$2 /g;
	625
	626	# remove period that appears in a person's initals
	627	s/\s([A-Z])\./ $1 /g;
	628
	629	# replace hyphens in hypheanted words and names with a space
	630	s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
	631
	632	# Convert the remaining text to "clause format",
	633	# This means removing all excess punctuation and garbage text,
	634	# normalising valid punctuation to fullstops and commas,
	635	# then putting one cluse on each line.
	636
	637	# Insert newline when the end of a sentence is detected
	638	# (delimter is: "[\.\?\!]\s")
	639	s/\s*[\.\?\!]\s+/\n/g;
	640
	641	# split numbers after four digits
	642	s/(\d\d\d\d)/$1 /g;
	643
	644	# split words after 32 characters
	645
	646	# squash repeated punctuation
	647	tr/A-Za-z0-9 //cs;
	648
	649	# save email addresses
	650	# s/\w+@\w+\.[\w\.]+/EMAIL/g;
	651
	652	# normalise clause breaks (mostly punctuation symbols) to commas
	653	s/[^A-Za-z0-9 \n]+/ , /g;
	654
	655	# Remove repeated commas, and replace with newline
	656	s/\s*,[, ]+/\n/g;
	657
	658	# remove extra whitespace
	659	s/ +/ /sg;
	660	s/^\s+//mg;
	661	s/\s*$/\n/mg;
	662
	663	# remove lines that contain one word or less
	664	s/^\w*$//mg;
	665	s/^\s*$//mg;
	666	tr/\n//s;
	667
	668	return $_;
	669
	670	}
	671
	672
	673
	674	# Execute a system command
	675
	676	sub execute {
	677	my ($command, $verbosity, $outhandle) = @_;
	678	print $outhandle "Executing: $command\n" if ($verbosity > 2);
	679	$! = 0;
	680	my $status = system($command);
	681	if ($status != 0) {
	682	print STDERR "Phind - Error executing '$command': $!\n";
[3540]	683	exit($status); # this causes the build to fail...
[3472]	684	}
	685	}
	686
	687
	688	# Generate the vocabulary, symbol statistics, and numbers file from the
	689	# clauses file. This is legacy code, so is a bit messy and probably wont
	690	# run under windows.
	691
	692	sub extract_vocabulary {
	693	my ($self) = @_;
	694
	695	my $verbosity = $self->{'verbosity'};
	696	my $out = $self->{'outhandle'};
	697
	698	my $collectiondir = $self->{'collectiondir'};
	699	my $phinddir = $self->{'phinddir'};
	700
[10218]	701	my $language_exp = $self->{'language'};
[3472]	702
	703	my ($w, $l, $line, $word);
	704
	705	my ($first_delimiter, $last_delimiter,
	706	$first_stopword, $last_stopword,
	707	$first_extractword, $last_extractword,
	708	$first_contentword, $last_contentword,
	709	$phrasedelimiter);
	710
	711	my $thesaurus = $self->{'thesaurus'};
	712	my ($thesaurus_links, $thesaurus_terms,
	713	%thesaurus, $first_thesaurusword, $last_thesaurusword);
	714
	715	my %symbol;
	716	my (%freq);
	717
	718	print $out "Calculating vocabulary\n" if ($verbosity > 1);
	719
	720	# Read and store the stopwords
[27904]	721	my $stopdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
[3472]	722	my $stopword_files = ();
	723	my ($language, $language_dir, $file, $file_name);
	724	my %stopwords;
	725
	726	# Examine each directory in the stopword directory
	727	opendir(STOPDIR, $stopdir);
	728	foreach $language (readdir STOPDIR) {
	729
	730	# Ignore entries that do not match the classifier's language
	731	next unless ($language =~ /$language_exp/);
[27904]	732	$language_dir = &FileUtils::filenameConcatenate($stopdir, $language);
[3472]	733	next unless (-d "$language_dir");
	734
	735	opendir(LANGDIR, $language_dir);
	736	foreach $file (readdir LANGDIR) {
	737
	738	# Ignore entries that are not stopword files
	739	next unless ($file =~ /sw$/);
[27904]	740	$file_name = &FileUtils::filenameConcatenate($language_dir, $file);
[3472]	741	next unless (-f "$file_name");
	742
	743	# Read the stopwords
	744	open(STOPFILE, "<$file_name");
	745	while (<STOPFILE>) {
	746	s/^\s+//;
	747	s/\s.*//;
	748	$word = $_;
	749	$l = lc($word);
	750	$stopwords{$l} = $word;
	751	}
	752	close STOPFILE;
	753
	754	}
	755	closedir LANGDIR;
	756	}
	757	closedir STOPDIR;
	758
	759	# Read thesaurus information
	760	if ($thesaurus) {
	761
	762	# link file exists
[27904]	763	$thesaurus_links = &FileUtils::filenameConcatenate($collectiondir, "etc", "$thesaurus.lnk");
[3472]	764	die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
	765
	766	# ensure term file exists in the correct language
	767	if ($language_exp =~ /^([a-z][a-z])/) {
	768	$language = $1;
	769	} else {
	770	$language = 'en';
	771	}
[27904]	772	$thesaurus_terms = &FileUtils::filenameConcatenate($collectiondir, "etc", "$thesaurus.$language");
[3472]	773	die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
	774
	775
	776	# Read the thesaurus terms
	777	open(TH, "<$thesaurus_terms");
	778	while(<TH>) {
	779	s/^\d+ //;
	780	s/$.*$//;
	781	foreach $w (split(/\s+/, $_)) {
	782	$thesaurus{lc($w)} = $w;
	783	}
	784	}
	785	close TH;
	786	}
	787
	788	# Read words in the text and count occurences
	789	open(TXT, "<$phinddir/clauses");
	790
	791	my @words;
	792	while(<TXT>) {
	793	$line = $_;
	794	next unless ($line =~ /./);
	795
	796	@words = split(/\s+/, $line);
	797	foreach $w (@words) {
	798	$l = lc($w);
	799	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
	800	$freq{$w}++;
	801	}
	802	$freq{$senlimit}++;
	803	}
	804
	805	close TXT;
	806
	807	# Calculate the "best" form of each word
	808	my (%bestform, %totalfreq, %bestfreq);
	809
	810	foreach $w (sort (keys %freq)) {
	811	$l = lc($w);
	812
	813	# totalfreq is the number of times a term appears in any form
	814	$totalfreq{$l} += $freq{$w};
	815
	816	if (defined $stopwords{$l}) {
	817	$bestform{$l} = $stopwords{$l};
	818
	819	} elsif (defined $thesaurus{$l}) {
	820	$bestform{$l} = $thesaurus{$l};
	821
	822	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
	823	$bestfreq{$l} = $freq{$w};
	824	$bestform{$l} = $w;
	825	}
	826	}
	827	undef %freq;
	828	undef %bestfreq;
	829
	830
	831	# Assign symbol numbers to tokens
	832	my $nextsymbol = 1;
	833	my (@vocab);
	834
	835	# Delimiters
	836	$first_delimiter = 1;
	837
	838	foreach $word (@delimiters) {
	839
	840	# $word = lc($word); # jrm21
	841	$word = uc($word);
	842	$bestform{$word} = $word;
	843	$vocab[$nextsymbol] = $word;
	844	$symbol{$word} = $nextsymbol;
	845	$nextsymbol++;
	846	}
	847	$last_delimiter = $nextsymbol - 1;
	848	# Stopwords
	849	$first_stopword = $nextsymbol;
	850
	851	foreach my $word (sort keys %stopwords) {
	852	# don't include stopword unless it occurs in the text
	853	$word = lc($word);
	854	next unless ($totalfreq{$word});
	855	next if ($symbol{$word});
	856
	857	$vocab[$nextsymbol] = $word;
	858	$symbol{$word} = $nextsymbol;
	859	$nextsymbol++;
	860	}
	861	$last_stopword = $nextsymbol - 1;
	862	$first_contentword = $nextsymbol;
	863
	864	# Thesaurus terms
	865	if ($thesaurus) {
	866	$first_thesaurusword = $nextsymbol;
	867
	868	foreach my $word (sort keys %thesaurus) {
	869
	870	$word = lc($word);
	871	next if ($symbol{$word});
	872	$bestform{$word} = $thesaurus{$word};
	873
	874	$vocab[$nextsymbol] = $word;
	875	$symbol{$word} = $nextsymbol;
	876	$nextsymbol++;
	877
	878	}
	879	$last_thesaurusword = $nextsymbol - 1;
	880	}
	881
	882	# Other content words
	883	$first_extractword = $nextsymbol;
	884
	885	foreach my $word (sort (keys %bestform)) {
	886
	887	next if ($symbol{$word});
	888
	889	$vocab[$nextsymbol] = $word;
	890	$symbol{$word} = $nextsymbol;
	891	$nextsymbol++;
	892	}
	893	$last_extractword = $nextsymbol - 1;
	894	$last_contentword = $nextsymbol - 1;
	895
	896	# Outut the words
	897	print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
	898	open(VOC, ">$phinddir/clauses.vocab");
	899
	900	for (my $i = 1; $i < $nextsymbol; $i++) {
	901	$w = $vocab[$i];
	902
	903	print VOC "$bestform{$w}\n";
	904	$totalfreq{$w} = 0 unless ($totalfreq{$w});
	905	}
	906	close VOC;
	907
	908
	909	# Create statistics file
	910	# Output statistics about the vocablary
	911	print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
[27904]	912	&FileUtils::removeFiles("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
[3472]	913
	914	open(STAT, ">$phinddir/clauses.stats")
	915	\|\| die "Cannot open $phinddir/clauses.stats: $!";
	916
	917	print STAT "first_delimiter $first_delimiter\n";
	918	print STAT "last_delimiter $last_delimiter\n";
	919	print STAT "first_stopword $first_stopword\n";
	920	print STAT "last_stopword $last_stopword\n";
	921	if ($thesaurus) {
	922	print STAT "first_thesaurusword $first_thesaurusword\n";
	923	print STAT "last_thesaurusword $last_thesaurusword\n";
	924	}
	925	print STAT "first_extractword $first_extractword\n";
	926	print STAT "last_extractword $last_extractword\n";
	927	print STAT "first_contentword $first_contentword\n";
	928	print STAT "last_contentword $last_contentword\n";
	929	print STAT "first_symbol $first_delimiter\n";
	930	print STAT "last_symbol $last_contentword\n";
	931	print STAT "first_word $first_stopword\n";
	932	print STAT "last_word $last_contentword\n";
	933	close STAT;
	934
	935	undef @vocab;
	936
	937
	938	# Create numbers file
	939	# Save text as symbol numbers
	940	print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
	941
	942	open(TXT, "<$phinddir/clauses");
	943	open(NUM, ">$phinddir/clauses.numbers");
	944
	945	## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
	946	## print NUM "$symbol{lc($colstart)}\n"; # jrm21
	947	$phrasedelimiter = $symbol{$senlimit};
	948	print NUM "$symbol{$colstart}\n";
	949
	950	# set up the special symbols that delimit documents and sentences
	951	while(<TXT>) {
	952
	953	# split sentence into a list of tokens
	954	$line = $_;
	955	next unless ($line =~ /./);
	956	@words = split(/\s+/, $line);
	957
	958	# output one token at a time
	959	foreach $word (@words) {
	960	# don't lower-case special delimiters - jrm21
	961	if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
	962	$word = lc($word);
	963	}
	964	print NUM "$symbol{$word}\n";
	965	}
	966
	967	# output phrase delimiter
	968	print NUM "$phrasedelimiter\n";
	969	}
	970
	971	close TXT;
	972	# print NUM "$symbol{lc($colend)}\n";# jrm21
	973	print NUM "$symbol{$colend}\n";
	974	close NUM;
	975
	976	# Save thesaurus data in one convenient file
	977	if ($thesaurus) {
	978
[27904]	979	my $thesaurusfile = &FileUtils::filenameConcatenate($phinddir, "$thesaurus.numbers");
[3472]	980
	981
	982	print $out "Saving thesaurus as numbers in $thesaurusfile\n"
	983	if ($verbosity > 1);
	984
	985	# Read the thesaurus terms
	986	my ($num, $text, %thes_symbols);
	987
	988	open(TH, "<$thesaurus_terms");
	989	while(<TH>) {
	990	chomp;
	991	@words = split(/\s+/, $_);
	992	$num = shift @words;
	993	$text = "";
	994
	995	# translate words into symbol numbers
	996	foreach $word (@words) {
	997	$word = lc($word);
	998	if ($symbol{$word}) {
	999	$text .= "s$symbol{$word} ";
	1000	} elsif ($verbosity) {
	1001	print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
	1002	}
	1003	}
	1004	$text =~ s/ $//;
	1005	$thes_symbols{$num} = $text;
	1006	}
	1007	close TH;
	1008
	1009	# Read the thesaurus links and write the corresponding data
	1010	open(TH, "<$thesaurus_links");
	1011	open(THOUT, ">$thesaurusfile");
	1012
	1013	while(<TH>) {
	1014	chomp;
	1015	($num, $text) = split(/:/, $_);
	1016
	1017	if (defined($thes_symbols{$num})) {
	1018	print THOUT "$num:$thes_symbols{$num}:$text\n";
	1019	} else {
	1020	print THOUT "$num:untranslated:$text\n";
	1021	}
	1022	}
	1023	close TH;
	1024	close THOUT;
	1025	}
	1026
	1027
	1028
	1029
	1030	}
	1031
	1032
	1033	# renumber_phrases
	1034	#
	1035	# Prepare the phrases file to be input to mgpp. The biggest problem is
	1036	# reconciling the phrase identifiers used by the suffix program (which
	1037	# we'll call suffix-id numbers) with the numbers used in the thesaurus
	1038	# (thesaurus-id) to create a common set of phind id numbers (phind-id).
	1039	# Phind-id numbers must be sorted by frequency of occurrence.
	1040	#
	1041	# Start creating a set of phind-id numbers from the sorted suffix-id
	1042	# numbers and (if required) the thesaurus-id numbers. Then add any other
	1043	# phrases occurring in the thesaurus.
	1044	#
	1045	# The last thing we have to do is restore the vocabulary information to the
	1046	# phrase file so that the phrases are stored as words, not as symbol
	1047	# numbers.
	1048
	1049	# The original phrases file looks something like this:
	1050	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
	1051	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
	1052	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
	1053	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
	1054
	1055
	sub renumber_phrases {
	    my $self = shift;

	    # The passes must run in this order, because each one reads the
	    # file written by the one before it:
	    #   phrases -> phrases.2 -> phrases.3 -> pdata.txt / pword.txt
	    renumber_suffix_data($self);
	    renumber_thesaurus_data($self);

	    return restore_vocabulary_data($self);
	}
	1064
	1065
	1066
	# renumber_suffix_data
	#
	# Translate the phrases file to phrases.2, using phind keys instead
	# of suffix keys and sorting the expansion and document data.
	#
	# Reads   <phinddir>/phrases, one phrase per line with ':'-separated
	#         fields: suffix-id, symbols, frequency, number of expansions,
	#         expansions (suffix-ids separated by ','), number of
	#         documents, documents (separated by ';').
	# Writes  <phinddir>/phrases.2, one phrase per line in the form
	#         phind-id:symbols:frequency:expansions count:documents count:
	#         expansions (phind-ids, each followed by ','):
	#         documents (each followed by ';').
	#
	# Phind-ids are handed out in file order, starting at 1.
	#
	# Dies if either file cannot be opened, if the output file cannot be
	# closed cleanly, or if a line refers to a suffix-id that has no
	# phind-id.

	sub renumber_suffix_data {
	    my ($self) = @_;

	    my $verbosity = $self->{'verbosity'};
	    my $out = $self->{'outhandle'};
	    print $out "Translate phrases: suffix-ids become phind-id's\n"
	        if ($verbosity);

	    my $phinddir = $self->{'phinddir'};
	    my $infile = &FileUtils::filenameConcatenate($phinddir, 'phrases');
	    my $outfile = &FileUtils::filenameConcatenate($phinddir, 'phrases.2');

	    # Pass 1.
	    # Read the phrase file. Calculate the initial set of phind-id
	    # numbers and store the total frequency of each phind-id, which is
	    # needed in pass 2 to sort the expansions.
	    my %suffixtophind;
	    my @totalfrequency;
	    my $nextphind = 1;

	    open(my $in, '<', $infile)
	        or die "Cannot open $infile: $!";
	    while (my $line = <$in>) {
	        chomp $line;

	        # fields: suffix-id, symbols (not needed here), frequency
	        my ($suffixid, undef, $frequency) = split(/:/, $line);

	        $suffixtophind{$suffixid} = $nextphind;
	        $totalfrequency[$nextphind] = $frequency;
	        $nextphind++;
	    }
	    close($in);

	    # Pass 2.
	    # Translate phrases file to phrases.2. Use phind keys (not suffix
	    # keys), sort expansion and document occurrence data in order of
	    # descending frequency.
	    open($in, '<', $infile)
	        or die "Cannot open $infile: $!";
	    open(my $phrases2, '>', $outfile)
	        or die "Cannot open $outfile: $!";

	    my $linenumber = 0;
	    while (my $line = <$in>) {
	        chomp $line;
	        my ($suffixid, $text, $tf, $countexp, $expansions,
	            $countdocs, $documents) = split(/:/, $line);

	        # get a phrase number for this line
	        die "ERROR - no phindid for: $suffixid"
	            unless (defined($suffixtophind{$suffixid}));
	        my $phindid = $suffixtophind{$suffixid};

	        # output status information
	        $linenumber++;
	        if ($verbosity > 2 && $linenumber % 1000 == 0) {
	            print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
	        }

	        # convert the expansions into phind-id numbers and sort them
	        # by descending total frequency
	        $expansions = '' unless (defined($expansions));
	        my @newexp;
	        foreach my $expid (split(/,/, $expansions)) {
	            die "ERROR - no phindid for: $expid"
	                unless (defined($suffixtophind{$expid}));
	            push @newexp, $suffixtophind{$expid};
	        }
	        @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;

	        # strip the 'd' prefix from the documents and sort them
	        $documents = '' unless (defined($documents));
	        $documents =~ s/d//g;
	        my @documents = sort by_doc_frequency split(/;/, $documents);

	        # output the phrase data; every expansion is followed by ','
	        # and every document by ';', as in the original format
	        print $phrases2 "$phindid:$text:$tf:$countexp:$countdocs:";
	        print $phrases2 join(",", @newexp), ",:", join(";", @documents), ";\n";
	    }

	    close($in);
	    # buffered write errors only show up when the handle is closed
	    close($phrases2)
	        or die "Cannot close $outfile: $!";
	}
	1179
	1180
	# renumber_thesaurus_data
	#
	# Translate phrases.2 to phrases.3, adding thesaurus data if available.
	#
	# Without a thesaurus ($self->{'thesaurus'} is false) phrases.2 is
	# simply moved to phrases.3. Otherwise:
	#   - <phinddir>/<thesaurus>.numbers is read (thesaurus-id:symbols:
	#     relations per line),
	#   - every thesaurus entry is given a phind-id: the phind-id of the
	#     phrase with the same symbols in phrases.2 if there is one,
	#     otherwise a new number after the last phind-id in phrases.2,
	#   - <phinddir>/<thesaurus>.phindid is written with the relations
	#     expressed in phind-ids,
	#   - <phinddir>/phrases.3 is written: the lines of phrases.2 with
	#     ":linkcount:relations" appended where thesaurus data exists,
	#     followed by one line for each thesaurus entry that did not
	#     occur in phrases.2.
	#
	# Dies if a file cannot be opened, if an output file cannot be closed
	# cleanly, or if the thesaurus file is malformed.

	sub renumber_thesaurus_data {
	    my ($self) = @_;

	    my $out = $self->{'outhandle'};
	    my $verbosity = $self->{'verbosity'};
	    my $thesaurus = $self->{'thesaurus'};

	    my $phinddir = $self->{'phinddir'};
	    my $infile = &FileUtils::filenameConcatenate($phinddir, "phrases.2");
	    my $outfile = &FileUtils::filenameConcatenate($phinddir, "phrases.3");

	    # If no thesaurus is defined, simply move the phrases file.
	    if (!$thesaurus) {
	        print $out "Translate phrases.2: no thesaurus data\n"
	            if ($verbosity);
	        &FileUtils::moveFiles($infile, $outfile);
	        return;
	    }

	    print $out "Translate phrases.2: add thesaurus data\n"
	        if ($verbosity);

	    # 1.
	    # Read thesaurus file and store (symbols->thesaurusid) mapping
	    my $thesaurusfile = &FileUtils::filenameConcatenate($phinddir, "$thesaurus.numbers");
	    my %symbolstothesid;

	    open(my $th, '<', $thesaurusfile)
	        or die "Cannot open $thesaurusfile: $!";
	    while (my $line = <$th>) {
	        chomp $line;
	        my ($thesid, $symbols) = split(/:/, $line);
	        $symbolstothesid{$symbols} = $thesid;
	    }
	    close($th);

	    # 2.
	    # Read phrases file to find thesaurus entries that already
	    # have a phindid. Store their phind-ids for later translation,
	    # and store their frequency for later sorting.
	    my %thesaurustophindid;
	    my %phindidtofrequency;
	    my $lastphindid;

	    open(my $in, '<', $infile)
	        or die "Cannot open $infile: $!";
	    while (my $line = <$in>) {
	        chomp $line;
	        my ($phindid, $symbols, $freq) = split(/:/, $line);

	        # remembered so that step 3 can continue the numbering
	        $lastphindid = $phindid;

	        # do we have a thesaurus id corresponding to this phrase?
	        if (defined($symbolstothesid{$symbols})) {
	            my $thesid = $symbolstothesid{$symbols};
	            $thesaurustophindid{$thesid} = $phindid;
	            $phindidtofrequency{$phindid} = $freq;
	        }
	    }
	    close($in);

	    undef %symbolstothesid;

	    # 3.
	    # Create phind-id numbers for remaining thesaurus entries,
	    # and note that their frequency is 0 for later sorting.
	    # Numbering continues after the last phind-id read from phrases.2
	    # (or starts at 1 if that file was empty).
	    my $nextphindid = ($lastphindid || 0) + 1;

	    open($th, '<', $thesaurusfile)
	        or die "Cannot open $thesaurusfile: $!";
	    while (my $line = <$th>) {
	        chomp $line;

	        # read thesaurus-id and ensure it has a corresponding phind-id
	        my ($thesid) = split(/:/, $line);
	        if (!defined($thesaurustophindid{$thesid})) {
	            $thesaurustophindid{$thesid} = $nextphindid;
	            $phindidtofrequency{$nextphindid} = 0;
	            $nextphindid++;
	        }
	    }
	    close($th);

	    # 4.
	    # Translate thesaurus file, replacing thesaurus-id numbers with
	    # phind-id numbers.
	    my $newthesaurusfile = &FileUtils::filenameConcatenate($phinddir, "$thesaurus.phindid");

	    open($th, '<', $thesaurusfile)
	        or die "Cannot open $thesaurusfile: $!";
	    open(my $to, '>', $newthesaurusfile)
	        or die "Cannot open $newthesaurusfile: $!";
	    while (my $line = <$th>) {
	        chomp $line;
	        my ($thesid, $symbols, $relations) = split(/:/, $line);

	        die "ERROR - malformed line in $thesaurusfile: $line"
	            unless ($thesid && $symbols);
	        die "ERROR - no phindid for thesaurus entry: $thesid"
	            unless $thesaurustophindid{$thesid};
	        my $phindid = $thesaurustophindid{$thesid};

	        # convert each part of the relation string to use phind-id
	        # numbers; at the same time, we want to sort the list by
	        # frequency.
	        my %linkidtotype;
	        $relations = '' unless (defined($relations));

	        foreach my $linktext (split(/;/, $relations)) {
	            # the first element is the linktype (e.g. BT, NT), the
	            # rest are the thesaurus-ids linked with that type
	            my ($type, @linkedthesids) = split(/,/, $linktext);

	            foreach my $linkedthesid (@linkedthesids) {
	                die "ERROR - no phindid for thesaurus link: $linkedthesid"
	                    unless (defined($thesaurustophindid{$linkedthesid}));
	                $linkidtotype{$thesaurustophindid{$linkedthesid}} = $type;
	            }
	        }

	        # sort the list of links, first by frequency, then by type.
	        # The final comparison on the id itself only decides between
	        # links that are otherwise equal, so that the output does not
	        # depend on hash ordering.
	        my @links = sort {
	            ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
	                or ($linkidtotype{$a} cmp $linkidtotype{$b})
	                or ($a <=> $b)
	        } (keys %linkidtotype);
	        my $linkcounter = (scalar @links);

	        # create a string describing the link information: a new
	        # ";type" group is started whenever the type changes
	        my $linktype = @links ? $linkidtotype{$links[0]} : '';
	        my $newrelation = $linktype;
	        foreach my $linkid (@links) {
	            if ($linkidtotype{$linkid} ne $linktype) {
	                $linktype = $linkidtotype{$linkid};
	                $newrelation .= ";" . $linktype;
	            }
	            $newrelation .= "," . $linkid;
	        }
	        $newrelation .= ";";

	        # output the new line
	        print $to "$phindid:$symbols:$linkcounter:$newrelation:\n";
	    }
	    close($th);
	    close($to)
	        or die "Cannot close $newthesaurusfile: $!";

	    undef %thesaurustophindid;
	    undef %phindidtofrequency;

	    # 5.
	    # Read thesaurus data (in phind-id format) into memory
	    my %thesaurusdata;

	    open($th, '<', $newthesaurusfile)
	        or die "Cannot open $newthesaurusfile: $!";
	    while (my $line = <$th>) {
	        chomp $line;
	        my ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $line);
	        die "ERROR - malformed line in $newthesaurusfile: $line"
	            unless ($phindid && $symbols);
	        $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
	    }
	    close($th);

	    # 6.
	    # Add thesaurus data to phrases file
	    open($in, '<', $infile)
	        or die "Cannot open $infile: $!";
	    open(my $phrases3, '>', $outfile)
	        or die "Cannot open $outfile: $!";

	    # Update existing phrases
	    while (my $line = <$in>) {
	        chomp $line;
	        my @fields = split(/:/, $line);
	        my $phindid = shift @fields;

	        # output the phrase data, with thesaurus information
	        print $phrases3 "$phindid:", join(":", @fields);

	        # add thesaurus data
	        if (defined($thesaurusdata{$phindid})) {
	            my (undef, $linkcounter, $relations) =
	                split(/:/, $thesaurusdata{$phindid});

	            print $phrases3 ":$linkcounter:$relations";

	            # mark the entry as written so it is skipped below
	            $thesaurusdata{$phindid} = "";
	        }
	        print $phrases3 "\n";
	    }
	    close($in);

	    # Add phrases that aren't already in the file
	    foreach my $phindid (sort numerically keys %thesaurusdata) {
	        next unless ($thesaurusdata{$phindid});

	        my ($symbols, $linkcounter, $relations) =
	            split(/:/, $thesaurusdata{$phindid});

	        print $phrases3 "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
	    }
	    # buffered write errors only show up when the handle is closed
	    close($phrases3)
	        or die "Cannot close $outfile: $!";
	}
	1408
	# restore_vocabulary_data
	#
	# Read phrases.3 and restore vocabulary information. Then write
	# this data to the MGPP input files (pword.txt and pdata.txt) and
	# (if requested) to the saved phrases file.
	#
	# Reads   <phinddir>/phrases.3 and <phinddir>/clauses.vocab (one symbol
	#         per line; the symbol on line N has symbol number N).
	# Writes  <phinddir>/pdata.txt  one "<Document>key:text:fields...:"
	#                               line per phrase,
	#         <phinddir>/pword.txt  one "<Document>word" line per phrase,
	#                               where word is the phrase text for
	#                               single-word phrases and empty otherwise,
	#         $self->{'savephrases'} (only if set) one tab-separated line
	#                               per phrase: the first and third fields
	#                               following the phrase text, a flag that
	#                               is 1 when seven fields follow the text
	#                               (phrase has thesaurus data), the text.
	#
	# Dies if a file cannot be opened or an output file cannot be closed
	# cleanly.

	sub restore_vocabulary_data {
	    my ($self) = @_;

	    my $out = $self->{'outhandle'};
	    my $verbosity = $self->{'verbosity'};
	    print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);

	    my $phinddir = $self->{'phinddir'};
	    my $infile = &FileUtils::filenameConcatenate($phinddir, 'phrases.3');
	    my $vocabfile = &FileUtils::filenameConcatenate($phinddir, 'clauses.vocab');
	    my $datafile = &FileUtils::filenameConcatenate($phinddir, 'pdata.txt');
	    my $wordfile = &FileUtils::filenameConcatenate($phinddir, 'pword.txt');

	    my $savephrases = $self->{'savephrases'};

	    # 1.
	    # Read the vocabulary file. Symbol numbers start at 1, so slot 0
	    # of the array is left unused.
	    open(my $vocab, '<', $vocabfile)
	        or die "Cannot open $vocabfile: $!";
	    my @symbol = (undef);
	    while (my $entry = <$vocab>) {
	        chomp $entry;
	        push @symbol, $entry;
	    }
	    close($vocab);

	    # 2.
	    # Translate phrases.3 to MGPP input files
	    open(my $in, '<', $infile)
	        or die "Cannot open $infile: $!";
	    open(my $data, '>', $datafile)
	        or die "Cannot open $datafile: $!";
	    open(my $wordfh, '>', $wordfile)
	        or die "Cannot open $wordfile: $!";

	    # Save the phrases in a separate text file
	    my $save;
	    if ($savephrases) {
	        print $out "Saving phrases in $savephrases\n" if ($verbosity);
	        open($save, '>', $savephrases)
	            or die "Cannot open $savephrases: $!";
	    }

	    while (my $line = <$in>) {
	        chomp $line;
	        my @fields = split(/:/, $line);

	        # get a phrase number for this line
	        my $key = shift @fields;

	        # restore the text of the phrase
	        my $text = shift @fields;
	        $text = '' unless (defined($text));
	        $text =~ s/s(\d+)/$symbol[$1]/g;

	        # Only single-word phrases are entered in the word index.
	        # The word is worked out afresh for every line: previously an
	        # 'untranslated' thesaurus entry kept the word of the line
	        # before it and so was indexed under an unrelated word.
	        my $word = "";
	        if ($text !~ / / && $text ne 'untranslated') {
	            $word = $text;
	        }

	        # output the phrase data
	        print $data "<Document>";
	        print $data "$key:$text:", join(":", @fields), ":\n";

	        # output the word index search data
	        print $wordfh "<Document>$word\n";

	        # output the phrases to a text file
	        if ($savephrases) {
	            # seven remaining fields means that thesaurus data was
	            # appended to this phrase
	            my $isThesaurus = ((scalar @fields) == 7) ? 1 : 0;
	            print $save $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
	        }
	    }
	    close($in);

	    # buffered write errors only show up when the handle is closed
	    close($wordfh)
	        or die "Cannot close $wordfile: $!";
	    close($data)
	        or die "Cannot close $datafile: $!";
	    if ($savephrases) {
	        close($save)
	            or die "Cannot close $savephrases: $!";
	    }
	}
	1500
	1501
	1502
	1503	# sort routines used to renumber phrases
	1504
	# Sort comparator: ascending numeric order. Compares the package
	# variables $a and $b that sort() sets up for each comparison.
	sub numerically {
	    return $a <=> $b;
	}
	1506
	# Sort comparator: order document entries by descending frequency.
	# An entry is either "docnum" (frequency taken to be 1) or
	# "docnum,frequency". Compares the package variables $a and $b that
	# sort() sets up for each comparison.
	sub by_doc_frequency {
	    my ($freq_a, $freq_b) = map {
	        my $entry = $_;    # copy, so the values being sorted stay intact
	        my $freq = 1;
	        if ($entry =~ /,/) {
	            # drop the leading "docnum," so only the frequency remains
	            ($freq = $entry) =~ s/\d+,//;
	        }
	        $freq;
	    } ($a, $b);

	    return ($freq_b <=> $freq_a);
	}
	1521
	1522	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/classify/Phind.pm@ 28706

Download in other formats: