Context Navigation

source: trunk/gsdl/perllib/classify/phind.pm@ 1646

Last change on this file since 1646 was 1646, checked in by paynter, 24 years ago
Arguments for setting suffix program parameters.
Property svn:keywords set to `Author Date Id Revision`
File size: 26.0 KB

Rev	Line
[1643]	1	###########################################################################
	2	#
	3	# phind.pm -- the Phind classifier
	4	#
	5	# Copyright (C) 2000 Gordon W. Paynter
	6	# Copyright (C) 2000 New Zealand Digital Library Project
	7	#
	8	#
	9	# A component of the Greenstone digital library software
	10	# from the New Zealand Digital Library Project at the
	11	# University of Waikato, New Zealand.
	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# The phind classifier plugin
	30	#
	31	# options are:
	32	# title=Title The title field for this classification
	33	# text=fields The text used to build the phrase hierarchy
	34	# phindexdir=directory Location of phind index files
	35	# verbosity=num Control amount of output
[1646]	36	# untidy=true Do not clean up intermediate files
	37	# suffixmode=num Mode of suffix program (0 = all phrases, 1 = stopword)
	38	# suffixsize=num Number of symbols available to suffix program
[1643]	39
	40
	41	# How a classifier works.
	42	#
	43	# When a classifier is requested in the collect.cfg file, buildcol creates a
	44	# new classifier object (such as the one defined in this file) and later
	45	# passes each document object to the classifier in turn. Four functions are
	46	# used:
	47	#
	48	# 1. "new" is called before the documents are processed to set up the
	49	# classifier.
	50	#
	51	# 2. "init" is called after buildcol.pl has created the indexes etc but
	52	# before the documents are classified in order that the classifier might
	53	# set any variables it requires, etc.
	54	#
	55	# 3. "classify" is called once for each document object. The classifier
	56	# "classifies" each document and updates its local data accordingly.
	57	#
	58	# 4. "get_classify_info" is called after every document has been
	59	# classified. It collates the information about the documents and
	60	# stores a reference to the classifier so that Greenstone can later
	61	# display it.
	62
	63
	64	package phind;
	65
	66	use BasClas;
	67	use util;
	68
	# Make this package a subclass of BasClas at compile time.
	BEGIN {
	    @ISA = qw(BasClas);
	}
	72
	# Delimiter symbols.  The document delimiter is written into the clauses
	# file, and all four are given the first symbol numbers when the
	# vocabulary is built.  (This should be abstracted out someplace.)
	my @delimiters = qw(COLLECTIONSTART COLLECTIONEND DOCUMENTLIMIT SENTENCELIMIT);
	my ($colstart, $colend, $doclimit, $senlimit) = @delimiters;
	79
	# Create a new phind browser based on the options in collect.cfg
	#
	# Arguments: the class name followed by the option list.  The last
	# element of the option list is popped off and used as the filehandle
	# for progress messages; the remaining elements are option strings
	# (text=, title=, phindexdir=, suffixsize=, suffixmode=, verbosity=,
	# untidy).  Unrecognised options are ignored.
	#
	# Before the classifier is set up, the constructor checks that the
	# "suffix" program, the phindcgi program and the compiled Phind.class
	# file exist.  If any is missing, or GSDLOS names Windows, it prints an
	# explanation to STDERR and exits with status 1.
	#
	# Side effect: the phindex directory is removed (if present) and
	# recreated empty.
	#
	# Returns the object, blessed into $class.

	sub new {
	    my ($class, @options) = @_;
	    my $self = BasClas->new($class, @_);
	    my $out = pop @options;

	    # Phind installation check
	    # The phind phrase browser is research software and is not installed
	    # by default.  If the user attempts to use it we warn them that it's
	    # a bit dodgy, then tell them how to install it.  If they can do that
	    # and get all the files in place, then we let them proceed.

	    print $out "The Phind classifier for Greenstone.\n";
	    print $out "Checking the phind phrase browser requirements...\n";

	    # Make sure we're not in windows
	    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /windows/i) {
	        print STDERR "Phind currently only works under Unix\n";
	        exit(1);
	    }

	    # Ensure the Phind generate scripts are in place
	    my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
	    my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");

	    if (!(-e $file1)) {
	        print STDERR "The phind \"suffix\" program is not installed. ";
	        print STDERR "To install it, change to the directory\n";
	        print STDERR " $src\n";
	        print STDERR "and type \"make install-phind\".\n\n";
	        exit(1);
	    }

	    # Ensure the Phind CGI script is in place
	    $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
	    $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");

	    if (!(-e $file1)) {
	        print STDERR "The phind CGI program is not installed. ";
	        print STDERR "To install it, change to the directory\n";
	        print STDERR " $src\n";
	        print STDERR "and type \"make install-phind\".\n\n";
	        exit(1);
	    }

	    # Ensure the Phind Java applet is in place
	    $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
	    $file1 = &util::filename_cat($src, "Phind.class");

	    if (!(-e $file1)) {
	        print STDERR "The phind Java classes are not compiled. ";
	        print STDERR "To compile them, change to the directory\n";
	        print STDERR " $src\n";
	        print STDERR "and use your Java compiler to compile Phind.java.\n";
	        print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
	        exit(1);
	    }

	    # The installation appears OK - set up the classifier defaults
	    my $collection = $ENV{'GSDLCOLLECTION'};
	    my $phindexdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "phindex");
	    my $language = "english";

	    my $title = "Topic";
	    my $indexes = "section:Title,section:text";

	    my $suffixmode = 1;
	    my $suffixsize = 40000000;

	    my $verbosity = 2;
	    my $untidy = 0;

	    # parse the options (the loop variable is lexical so that it does
	    # not leak into the package namespace)
	    foreach my $option (@options) {
	        if ($option =~ /^text=(.*)$/i) {
	            $indexes = $1;
	        } elsif ($option =~ /^title=(.*)$/i) {
	            $title = $1;
	        } elsif ($option =~ /^phindexdir=(.*)$/i) {
	            $phindexdir = $1;
	        } elsif ($option =~ /^suffixsize=(.*)$/i) {
	            $suffixsize = $1;
	        } elsif ($option =~ /^suffixmode=(.*)$/i) {
	            $suffixmode = $1;
	        } elsif ($option =~ /^verbosity=(.*)$/i) {
	            $verbosity = $1;
	        } elsif ($option =~ /^untidy/i) {
	            $untidy = 1;
	        }
	    }

	    $self->{'collection'} = $collection;
	    $self->{'title'} = $title;
	    $self->{'indexes'} = $indexes;

	    $self->{'suffixmode'} = $suffixmode;
	    $self->{'suffixsize'} = $suffixsize;

	    $self->{'verbosity'} = $verbosity;
	    $self->{'untidy'} = $untidy;

	    # limit languages: a comma-separated list becomes a regular
	    # expression alternation
	    $language =~ s/,/|/g;
	    $self->{'language_exp'} = $language;

	    # NOTE(review): this used to be assigned from a variable that is
	    # never set anywhere in this file, so the value was always undefined.
	    # The key is kept (still undefined) in case other code looks for it.
	    $self->{'delimiter'} = undef;

	    # reset phindex directory
	    if (-e "$phindexdir") {
	        &util::rm_r("$phindexdir");
	    }
	    &util::mk_dir("$phindexdir");
	    $self->{'phindexdir'} = $phindexdir;

	    return bless $self, $class;
	}
	199
	200
	# Initialise the phind classifier
	#
	# Opens (truncating) the "clauses" and "docs.txt" files inside the
	# phindex directory and stores the two output filehandles in
	# $self->{'txthandle'} and $self->{'dochandle'}.  Dies if either file
	# cannot be opened.
	#
	# The handles are lexical and opened with three-argument open, so each
	# classifier object owns its own pair rather than sharing global
	# bareword handles, and no symbolic references are needed to use them.

	sub init {
	    my $self = shift (@_);

	    # open filehandles for documents and text
	    my $phindexdir = $self->{'phindexdir'};

	    my $clausefile = &util::filename_cat("$phindexdir", "clauses");
	    &util::rm($clausefile) if (-e $clausefile);
	    open(my $txthandle, '>', $clausefile)
	        or die "Cannot open $clausefile: $!";
	    $self->{'txthandle'} = $txthandle;

	    my $docfile = &util::filename_cat("$phindexdir", "docs.txt");
	    &util::rm($docfile) if (-e $docfile);
	    open(my $dochandle, '>', $docfile)
	        or die "Cannot open $docfile: $!";
	    $self->{'dochandle'} = $dochandle;
	}
	220
	# Classify each document.
	#
	# Each document is passed here in turn.  The classifier extracts the
	# text of each and stores it in the clauses file.  Document details are
	# stored in the docs.txt file.
	#
	# Argument: the document object.  Documents that carry Language
	# metadata not matching $self->{'language_exp'} are skipped.
	#
	# For every accepted document this writes one "<Document>" line (OID
	# and title) to the document handle, then the document delimiter and
	# the tokenised text of each configured index to the text handle.
	# Dies if an index specification is not of the form level:field or
	# names an unknown level.

	sub classify {
	    my $self = shift (@_);
	    my ($doc_obj) = @_;

	    my $verbosity = $self->{'verbosity'};
	    my $top_section = $doc_obj->get_top_section();

	    my $title = $doc_obj->get_metadata_element ($top_section, "Title");
	    $title = "" unless defined $title;
	    print "process: $title\n" if ($verbosity > 2);

	    # only consider english-language files
	    my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
	    my $phrlanguage = $self->{'language_exp'};
	    return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));

	    # record this file; the running count lives on the object so that it
	    # survives between calls
	    my $total = ++$self->{'phind_doccount'};
	    print "file $total: $title\n" if ($verbosity);

	    # Store document details
	    my $OID = $doc_obj->get_OID();
	    $OID = "NULL" unless defined $OID;
	    my $dochandle = $self->{'dochandle'};
	    print $dochandle "<Document>\t$OID\t$title\n";

	    # Store the text occurring in this object

	    # output the document delimiter
	    my $txthandle = $self->{'txthandle'};
	    print $txthandle "$doclimit\n";

	    # iterate over the required indexes and store their text
	    my $indexes = $self->{'indexes'};
	    my $text = "";

	    foreach my $part (split(/,/, $indexes)) {

	        # Each field has a level and a data element (e.g. document:Title)
	        my ($level, $field) = split(/:/, $part);
	        die "Invalid phind index specification ($part)\n"
	            unless ($level && $field);

	        my $data = "";

	        # Extract the text from every section
	        # (In phind, document:text and section:text are equivalent)
	        if ($field eq "text") {
	            my $section = $doc_obj->get_top_section();
	            while (defined($section)) {
	                my $chunk = $doc_obj->get_text($section);
	                $data .= (defined $chunk ? $chunk : "") . "\n";
	                $section = $doc_obj->get_next_section($section);
	            }
	        }

	        # Extract a metadata field from a document
	        elsif ($level eq "document") {
	            my $value = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
	            $data = $value if defined $value;
	        }

	        # Extract metadata from every section in a document
	        elsif ($level eq "section") {
	            my $section = $doc_obj->get_top_section();
	            while (defined($section)) {
	                # a section may simply lack this metadata field
	                my $value = $doc_obj->get_metadata_element($section, $field);
	                $data .= (defined $value ? $value : "") . "\n";
	                $section = $doc_obj->get_next_section($section);
	            }
	        }

	        # Some sort of specification which I don't understand
	        else {
	            die "Unknown level ($level) in phind index ($part)\n";
	        }

	        $text .= convert_gml_to_tokens($data) . "\n";
	    }

	    # output the text, with runs of newlines squeezed to one
	    $text =~ tr/\n//s;
	    print $txthandle "$text";
	}
	312
	313
	314
	# Construct the classifier from the information already gathered
	#
	# When get_classify_info is called, the clauses and docs.txt files have
	# already been constructed in the phindex directory.  This function will
	# translate them into compressed, indexed MGPP files that can be read by
	# the phindcgi script.  It will also register our classifier so that it
	# shows up in the navigation bar.
	#
	# Steps: close the clause/document handles, extract the vocabulary, run
	# the suffix program, renumber the phrases, build the pdata, pword and
	# docs databases with the mgpp tools, and (unless the untidy option was
	# given) delete the intermediate files.
	#
	# Returns a reference to the classifier information hash.

	sub get_classify_info {
	    my $self = shift (@_);

	    my $verbosity = $self->{'verbosity'};
	    my $phindexdir = $self->{'phindexdir'};
	    my $untidy = $self->{'untidy'};
	    my $language = "english";

	    if ($verbosity) {
	        print STDERR "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
	    }

	    # Every document has been classified: close the output files so that
	    # all buffered text is on disk before the files are read back below.
	    foreach my $key ('txthandle', 'dochandle') {
	        my $handle = delete $self->{$key};
	        close($handle) if (defined $handle);
	    }

	    # Construct phind indexes
	    my $suffixmode = $self->{'suffixmode'};
	    my $suffixsize = $self->{'suffixsize'};

	    # Generate the vocabulary, symbol statistics, and numbers file
	    # from the clauses file
	    print "\nExtracting vocabulary and statistics\n" if $verbosity;
	    &extract_vocabulary($phindexdir, $language, $verbosity);

	    # Use the suffix program to generate the phindex/phrases file
	    print "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
	    &execute("suffix $phindexdir $suffixsize $suffixmode", $verbosity);

	    # Create the phrase file and put phrase numbers in phindex/phrases
	    print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
	    &renumber_phrases("$phindexdir", $verbosity);

	    # Locate the mgpp programs
	    my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
	    my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
	    my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");

	    my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
	    my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
	    my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
	    my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");

	    # Create the mg phrase database
	    print "\nCreating phrase databases\n";
	    my $mg_input = &util::filename_cat($phindexdir, "pdata.txt");
	    my $mg_stem = "pdata";

	    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
	    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
	    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);

	    # create the mg index of words
	    print "\nCreating word-level search indexes\n";
	    $mg_input = &util::filename_cat($phindexdir, "pword.txt");
	    $mg_stem = "pword";

	    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
	    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
	    &execute("$mg_perf_hash_build -d $phindexdir -f $mg_stem", $verbosity);
	    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
	    &execute("$mg_weights_build -d $phindexdir -f $mg_stem", $verbosity);
	    &execute("$mg_invf_dict -d $phindexdir -f $mg_stem", $verbosity);

	    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 1", $verbosity);
	    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 2", $verbosity);
	    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 3", $verbosity);

	    # create the mg document information database
	    print "\nCreating document information databases\n";
	    $mg_input = &util::filename_cat($phindexdir, "docs.txt");
	    $mg_stem = "docs";

	    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
	    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
	    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);

	    # Tidy up stray files, unless the untidy option asked us to keep them
	    if (!$untidy) {
	        print "\nCleaning up\n" if ($verbosity > 2);
	        &util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
	                  "$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
	                  "$phindexdir/phrases", "$phindexdir/docs.txt",
	                  "$phindexdir/pdata.txt", "$phindexdir/pword.txt");
	        my $outfile = 1;
	        while (-e "$phindexdir/outPhrase.$outfile") {
	            &util::rm("$phindexdir/outPhrase.$outfile");
	            $outfile++;
	        }
	    }

	    # Describe the classifier: a single invisible node whose only child
	    # is the URL of the phind browser for this collection
	    my $collection = $self->{'collection'};
	    my $url = "library?a=p&p=phind&c=$collection";

	    my %classifyinfo = ('thistype'=>'Invisible',
	                        'childtype'=>'Phind',
	                        'Title'=>$self->{'title'},
	                        'contains'=>[]);

	    push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
	    return \%classifyinfo;
	}
	423
	424
	425
	# Convert a string of GML/HTML text into "clause format": markup is
	# removed, punctuation is normalised, and the result has one clause
	# per line.
	#
	# Argument: the text to convert (undefined is treated as empty).
	# Returns the converted text.  The caller's $_ is left untouched.

	sub convert_gml_to_tokens {
	    my $text = shift @_;
	    $text = "" unless defined $text;

	    # "for" aliases $_ to $text for the duration of the block only, so
	    # the substitutions below cannot clobber the caller's $_.
	    for ($text) {

	        # FIRST, remove GML tags

	        # Replace all whitespace with a simple space
	        s/\s+/ /gs;

	        # Remove everything that is in a tag
	        s/\s<p>\s/ PARAGRAPHBREAK /isg;
	        s/\s<br>\s/ LINEBREAK /isg;
	        s/<[^>]*>/ /sg;

	        # Now we have the text, but it may contain HTML elements coded
	        # as character entities (&lt; and &gt;).  Decode the entities,
	        # then remove the tags that this reveals.
	        s/&lt;/</sg;
	        s/&gt;/>/sg;

	        s/\s+/ /sg;
	        s/\s<p>\s/ PARAGRAPHBREAK /isg;
	        s/\s<br>\s/ LINEBREAK /isg;
	        s/<[^>]*>/ /sg;

	        # decode &amp; and any entities that were hidden behind it
	        s/&amp;/&/sg;
	        s/&lt;/</sg;
	        s/&gt;/>/sg;
	        s/&amp;/&/sg;

	        # replace <p> and <br> placeholders with newlines
	        s/PARAGRAPHBREAK/\n/sg;
	        s/LINEBREAK/\n/sg;


	        # Exceptional punctuation
	        #
	        # We make special cases of some punctuation

	        # remove any apostrophe that indicates omitted letters
	        s/(\w+)\'(\w*\s)/ $1$2 /g;

	        # remove period that appears in a person's initials
	        s/\s([A-Z])\./ $1 /g;

	        # replace hyphens in hyphenated words and names with a space
	        s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;


	        # Convert the remaining text to "clause format".
	        # This means removing all excess punctuation and garbage text,
	        # normalising valid punctuation to fullstops and commas,
	        # then putting one clause on each line.

	        # Insert newline when the end of a sentence is detected
	        # (delimiter is: "[\.\?\!]\s")
	        s/\s*[\.\?\!]\s+/\n/g;

	        # split numbers after four digits
	        s/(\d\d\d\d)/$1 /g;

	        # squash repeated punctuation
	        tr/A-Za-z0-9 //cs;

	        # normalise clause breaks (mostly punctuation symbols) to commas
	        s/[^A-Za-z0-9 \n]+/ , /g;

	        # Remove repeated commas, and replace with newline
	        s/\s*,[, ]+/\n/g;

	        # remove extra whitespace
	        s/ +/ /sg;
	        s/^\s+//mg;
	        s/\s*$/\n/mg;

	        # remove lines that contain one word or less
	        s/^\w*$//mg;
	        s/^\s*$//mg;
	        tr/\n//s;
	    }

	    return $text;
	}
	513
	514
	# Execute a system command
	#
	# Arguments: the command line, and the verbosity level (the command is
	# echoed when verbosity is greater than 2).
	#
	# Returns normally when the command succeeds.  On failure an error is
	# printed to STDERR and the process exits with the command's own exit
	# code, or with 1 if the command could not be started or was killed by
	# a signal.  The raw wait status returned by system() is never passed
	# to exit(), because a status such as 256 would be truncated to a
	# successful exit code of 0.

	sub execute {
	    my ($command, $verbosity) = @_;
	    print "Executing: $command\n" if ($verbosity > 2);

	    my $status = system($command);
	    return if ($status == 0);

	    my ($reason, $exitcode);
	    if ($status == -1) {
	        # only in this case does $! describe the failure
	        $reason = "could not be started: $!";
	        $exitcode = 1;
	    } elsif ($status & 127) {
	        $reason = "killed by signal " . ($status & 127);
	        $exitcode = 1;
	    } else {
	        $exitcode = $status >> 8;
	        $reason = "exit status $exitcode";
	    }

	    print STDERR "phind.pm - Error executing $command: $reason\n";
	    exit($exitcode);
	}
	526
	527
	# Generate the vocabulary, symbol statistics, and numbers file from the
	# clauses file.  This is legacy code, so is a bit messy and probably won't
	# run under windows (the stopword files are gathered with find and xargs).
	#
	# Arguments: the phindex directory, the language name (selects the
	# directory under GSDLHOME/etc/phind), and the verbosity level.
	#
	# Reads  <dir>/clauses, the *.sw stopword files and, if present, the
	#        agrovoc.lex thesaurus file for the language.
	# Writes <dir>/clauses.vocab   - the best form of each symbol, one per
	#                                line, in symbol-number order;
	#        <dir>/clauses.stats   - the first/last symbol number of each
	#                                class of symbol;
	#        <dir>/clauses.numbers - the clauses text as symbol numbers.
	#
	# Symbol numbers start at 1 and are assigned in this order: delimiters,
	# stopwords that occur in the text, thesaurus terms, all other words.
	#
	# Dies if any of the files cannot be opened or written.

	sub extract_vocabulary {
	    my ($phindex_dir, $language, $verbosity) = @_;

	    my %symbol;
	    my %freq;

	    print "Calculating vocabulary\n" if ($verbosity > 1);

	    # Read and store the stopwords
	    my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`;
	    my %stopwords;
	    foreach my $w (split(/\s+/, $words)) {
	        $stopwords{lc($w)} = $w;
	    }

	    # Read and store the thesaurus terms
	    my %thesaurus;
	    my $use_thesaurus = 0;
	    my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
	                                       "$language", "agrovoc.lex");
	    if (-e "$lex_file") {
	        open(my $th, '<', $lex_file)
	            or die "Cannot open $lex_file: $!";
	        while (my $entry = <$th>) {
	            # drop the leading number and any parenthesised text
	            $entry =~ s/^\d+ //;
	            $entry =~ s/\(.*\)//;
	            foreach my $w (split(/\s+/, $entry)) {
	                $thesaurus{lc($w)} = $w;
	            }
	        }
	        close($th);
	        $use_thesaurus = 1;
	    }

	    # Read words in the text and count occurrences
	    my $clauses_file = "$phindex_dir/clauses";
	    open(my $txt, '<', $clauses_file)
	        or die "Cannot open $clauses_file: $!";

	    while (my $line = <$txt>) {
	        next unless ($line =~ /./);

	        foreach my $w (split(/\s+/, $line)) {
	            my $l = lc($w);
	            # stopwords and thesaurus terms are counted case-insensitively
	            my $counted = ((defined $stopwords{$l}) || (defined $thesaurus{$l})) ? $l : $w;
	            $freq{$counted}++;
	        }
	        $freq{$senlimit}++;
	    }
	    close($txt);

	    # Calculate the "best" form of each word
	    my (%bestform, %totalfreq, %bestfreq);

	    foreach my $w (sort (keys %freq)) {
	        my $l = lc($w);

	        # totalfreq is the number of times a term appears in any form
	        $totalfreq{$l} += $freq{$w};

	        if (defined $stopwords{$l}) {
	            $bestform{$l} = $stopwords{$l};

	        } elsif (defined $thesaurus{$l}) {
	            $bestform{$l} = $thesaurus{$l};

	        } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
	            # otherwise the most frequent capitalisation wins
	            $bestfreq{$l} = $freq{$w};
	            $bestform{$l} = $w;
	        }
	    }

	    undef %freq;
	    undef %bestfreq;


	    # Assign symbol numbers to tokens
	    my $nextsymbol = 1;
	    my @vocab;

	    # Delimiters.  Work on a lowercased copy of each delimiter so that
	    # the file-level delimiter list is not modified through the loop
	    # variable.
	    my $first_delimiter = 1;

	    foreach my $delimiter (@delimiters) {
	        my $word = lc($delimiter);
	        $bestform{$word} = uc($word);
	        $vocab[$nextsymbol] = $word;
	        $symbol{$word} = $nextsymbol;
	        $nextsymbol++;
	    }
	    my $last_delimiter = $nextsymbol - 1;

	    # Stopwords
	    my $first_stopword = $nextsymbol;

	    foreach my $key (sort keys %stopwords) {

	        # don't include stopword unless it occurs in the text
	        my $word = lc($key);
	        next unless ($totalfreq{$word});
	        next if ($symbol{$word});

	        $vocab[$nextsymbol] = $word;
	        $symbol{$word} = $nextsymbol;
	        $nextsymbol++;
	    }
	    my $last_stopword = $nextsymbol - 1;
	    my $first_contentword = $nextsymbol;

	    # Thesaurus terms
	    my ($first_thesaurusword, $last_thesaurusword);
	    if ($use_thesaurus) {
	        $first_thesaurusword = $nextsymbol;

	        foreach my $key (sort keys %thesaurus) {

	            my $word = lc($key);
	            next if ($symbol{$word});
	            $bestform{$word} = $thesaurus{$word};

	            $vocab[$nextsymbol] = $word;
	            $symbol{$word} = $nextsymbol;
	            $nextsymbol++;
	        }
	        $last_thesaurusword = $nextsymbol - 1;
	    }

	    # Other content words
	    my $first_extractword = $nextsymbol;

	    foreach my $word (sort (keys %bestform)) {

	        next if ($symbol{$word});

	        $vocab[$nextsymbol] = $word;
	        $symbol{$word} = $nextsymbol;
	        $nextsymbol++;
	    }
	    my $last_extractword = $nextsymbol - 1;
	    my $last_contentword = $nextsymbol - 1;


	    # Output the words
	    my $vocab_file = "$phindex_dir/clauses.vocab";
	    print "Saving vocabulary in $phindex_dir/clauses.vocab\n" if ($verbosity > 1);
	    open(my $voc, '>', $vocab_file)
	        or die "Cannot open $vocab_file: $!";

	    for (my $i = 1; $i < $nextsymbol; $i++) {
	        print $voc "$bestform{$vocab[$i]}\n";
	    }
	    close($voc) or die "Cannot write $vocab_file: $!";


	    # Output statistics about the vocabulary
	    my $stats_file = "$phindex_dir/clauses.stats";
	    print "Saving statistics in $phindex_dir/clauses.stats\n" if ($verbosity > 1);
	    &util::rm("$phindex_dir/clauses.stats") if (-e "$phindex_dir/clauses.stats");
	    open(my $stat, '>', $stats_file)
	        or die "Cannot open $phindex_dir/clauses.stats: $!";

	    print $stat "first_delimiter $first_delimiter\n";
	    print $stat "last_delimiter $last_delimiter\n";
	    print $stat "first_stopword $first_stopword\n";
	    print $stat "last_stopword $last_stopword\n";
	    if ($use_thesaurus) {
	        print $stat "first_thesaurusword $first_thesaurusword\n";
	        print $stat "last_thesaurusword $last_thesaurusword\n";
	    }
	    print $stat "first_extractword $first_extractword\n";
	    print $stat "last_extractword $last_extractword\n";
	    print $stat "first_contentword $first_contentword\n";
	    print $stat "last_contentword $last_contentword\n";
	    print $stat "first_symbol $first_delimiter\n";
	    print $stat "last_symbol $last_contentword\n";
	    print $stat "first_word $first_stopword\n";
	    print $stat "last_word $last_contentword\n";
	    close($stat) or die "Cannot write $stats_file: $!";

	    undef @vocab;


	    # Save text as symbol numbers
	    my $numbers_file = "$phindex_dir/clauses.numbers";
	    print "Saving text as numbers in $phindex_dir/clauses.numbers\n" if ($verbosity > 1);

	    open($txt, '<', $clauses_file)
	        or die "Cannot open $clauses_file: $!";
	    open(my $num, '>', $numbers_file)
	        or die "Cannot open $numbers_file: $!";

	    # the special symbols that delimit the collection and its sentences
	    my $phrasedelimiter = $symbol{lc($senlimit)};
	    print $num "$symbol{lc($colstart)}\n";

	    while (my $line = <$txt>) {

	        # split sentence into a list of tokens
	        next unless ($line =~ /./);

	        # output one token at a time
	        foreach my $word (split(/\s+/, $line)) {
	            print $num "$symbol{lc($word)}\n";
	        }

	        # output phrase delimiter
	        print $num "$phrasedelimiter\n";
	    }

	    print $num "$symbol{lc($colend)}\n";

	    # Close both files so that the numbers file is complete on disk
	    # before the next program reads it.
	    close($txt);
	    close($num) or die "Cannot write $numbers_file: $!";

	    return 1;
	}
	755
	756
	# Prepare the phrases file to be input to mgpp.
	# This means renumbering the phrases in order of decreasing frequency.
	#
	# This is legacy code, and a little ugly, and may be unix-specific
	# (particularly the sort command).
	#
	# Arguments: the phindex directory and the verbosity level.
	#
	# Reads  <dir>/phrases (sorted in place first) and <dir>/clauses.vocab.
	# Writes <dir>/pdata.txt - one "<Document>" record per phrase, with the
	#                          phrase and its expansions renumbered;
	#        <dir>/pword.txt - one "<Document>" record per phrase holding
	#                          the phrase text if it is a single word and
	#                          nothing otherwise.
	#
	# Dies if a file cannot be opened or written, or if a phrase identifier
	# has no phrase number.  A failing sort is handled by execute().

	sub renumber_phrases {
	    my ($phindex_dir, $verbosity) = @_;

	    my $phrases_file = "$phindex_dir/phrases";

	    # Sort the phrases into order of decreasing frequency (numeric,
	    # reversed, keyed on the third colon-separated field).
	    # This means the expansions will be sorted correctly later on.
	    # The key is given with -k because the obsolete "+2" form is no
	    # longer accepted by current sort implementations.
	    print "Sorting phrases into freq order\n" if ($verbosity);
	    &execute("sort -t ':' -k 3 -rn -o $phrases_file $phrases_file", $verbosity);

	    # Read the vocabulary; symbol numbers start at 1
	    print "Reading the vocabulary\n" if ($verbosity);
	    open(my $vocab_fh, '<', "$phindex_dir/clauses.vocab")
	        or die "Cannot open $phindex_dir/clauses.vocab: $!";

	    my @symbol;
	    my $i = 1;
	    while (my $entry = <$vocab_fh>) {
	        chomp $entry;
	        $symbol[$i++] = $entry;
	    }
	    close($vocab_fh);

	    # Create file for phrase data
	    #
	    # The phrases file looks something like this
	    # 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
	    # 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
	    # 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
	    # 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361

	    # The first field on each line is a unique phrase identifier.
	    # We need to calculate phrase numbers for each phrase
	    print "Calculate phrase numbers\n" if ($verbosity);

	    my %phrasenumber;
	    my $nextphrase = 1;

	    open(my $in, '<', $phrases_file)
	        or die "Cannot open $phrases_file: $!";
	    while (my $line = <$in>) {
	        chomp $line;

	        # we're only interested in the first field
	        $line =~ s/:.*//;

	        # phrase numbers follow the (sorted) line order
	        $phrasenumber{$line} = $nextphrase;
	        $nextphrase++;
	    }
	    close($in);


	    # Now we create a new phrase file using phrase numbers, not the old IDs.
	    print "Format phrase data for MGPP\n" if ($verbosity);

	    my $data_file = "$phindex_dir/pdata.txt";
	    my $index_file = "$phindex_dir/pword.txt";

	    open($in, '<', $phrases_file)
	        or die "Cannot open $phrases_file: $!";
	    open(my $data, '>', $data_file)
	        or die "Cannot open $data_file: $!";
	    open(my $idx, '>', $index_file)
	        or die "Cannot open $index_file: $!";

	    my $linenumber = 0;

	    while (my $line = <$in>) {
	        chomp $line;
	        my @fields = split(/:/, $line);

	        # get a phrase number for this line
	        my $key = shift @fields;
	        $key = "" unless defined $key;
	        die "ERROR - no phrase number for: $key"
	            unless (defined($phrasenumber{$key}));
	        my $num = $phrasenumber{$key};

	        # get the text of the phrase, replacing each symbol reference
	        # (s<number>) with the corresponding vocabulary entry
	        my $text = shift @fields;
	        $text = "" unless defined $text;
	        $text =~ s/s(\d+)/$symbol[$1]/g;

	        # only single-word phrases go into the word index
	        my $word = ($text =~ / /) ? "" : $text;

	        $linenumber++;
	        if ($linenumber % 1000 == 0) {
	            print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 2);
	        }
	        print "$num: $key\t($text)\n" if ($verbosity > 3);

	        # get the phrase frequency
	        my $tf = shift @fields;
	        $tf = "" unless defined $tf;

	        # get the number of expansions
	        my $countexp = shift @fields;
	        $countexp = "" unless defined $countexp;

	        # get the expansions and convert them into phrase numbers
	        my $expansions = shift @fields;
	        $expansions = "" unless defined $expansions;
	        my @newexp = ();
	        foreach my $k (split(/,/, $expansions)) {
	            die "ERROR - no phrase number for: $k"
	                unless (defined($phrasenumber{$k}));
	            push @newexp, $phrasenumber{$k};
	        }
	        @newexp = sort numerically @newexp;

	        # get the number of documents
	        my $countdocs = shift @fields;
	        $countdocs = "" unless defined $countdocs;

	        # get the documents (split drops trailing empty fields, so the
	        # list may be missing altogether)
	        my $documents = shift @fields;
	        $documents = "" unless defined $documents;
	        $documents =~ s/d//g;
	        my @documents = split(/;/, $documents);
	        @documents = sort by_frequency @documents;

	        # output the phrase data
	        print $data "<Document>";
	        print $data "$num:$text:$tf:$countexp:$countdocs:";
	        print $data join(",", @newexp), ":", join(";", @documents), "\n";

	        # output the word index search data
	        print $idx "<Document>$word\n";
	    }

	    # Close everything so both output files are complete on disk before
	    # they are read by the next stage.
	    close($in);
	    close($data) or die "Cannot write $data_file: $!";
	    close($idx) or die "Cannot write $index_file: $!";

	    return;
	}
	894
	# sort routines used to renumber phrases

	# Sort comparator: ascending numeric order.
	sub numerically {
	    return $a <=> $b;
	}
	898
	# Sort comparator: descending order of frequency.  An entry that
	# contains a comma has its leading "<digits>," removed and the
	# remainder is used as its frequency; an entry without a comma has
	# frequency 1.
	sub by_frequency {
	    my @freq = map {
	        my $count = 1;
	        if (/,/) {
	            ($count = $_) =~ s/\d+,//;
	        }
	        $count;
	    } ($a, $b);

	    return ($freq[1] <=> $freq[0]);
	}
	913
	914
	915	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: