Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: trunk/gsdl/bin/script/indexes/buildkpiK.pl@ 1971

Last change on this file since 1971 was 1971, checked in by jmt14, 23 years ago

added files: Core.pm PDF.pm Parse.pm amend_pdf.pl

buildkpi.pl buildkpiS.pl buildkpiK.pl relation.pl

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 11.9 KB

Rev	Line
[1971]	1	#! /user/bin/perl
	2
	3	#usage: perl buildkpiK.pl [-R] [collection] [collection] etc
	4	#
	5	#-r or -R will remove previous index files so that you may build new ones
	6	#
	7	#The program performs the following tasks:
	8	#-gathers the specified collections on the command line OR
	9	#-gathers the directories of all the collections in the collect directory, this is all
	10	# the directories apart from CVS, modelcol, . and .. which are not collections.
	11	#-It then retrieves the archive.inf file from the archive directory of each collection
	12	# to obtain the unique file ID and filepath of every document in the collection
	13	#-Then parse through each doc.gml stored in filepath to gather information
	14	#-From each file collect kea phrases.
	15	#-Determine the number of kea phrases for each document
	16	#-Search for the kea phrases in the numbered phrase index. If a phrase is not there then
	17	# the program will write the kea phrase to the phrase index .
	18	#-Search for the kea phrases in the keyphrase to document index. If the phrase is there,
	19	# it will increment the number of documents that the keyphrase appears in and replace that
	20	# then append the hash ID to the list of documents in the entry. If the phrase is not there
	21	# then the program will write the kea phrase to the phrase index.
	22	#-Then write document ID, no of phrase, phrase number from index followed by number of times
	23	# phrase appears into the document_keyphrase index
	24
	25	$gsdlhome = $ENV{'GSDLHOME'};
	26	$collection;
	27
	28	require "getopts.pl";
	29	&Getopts('R'); #process option arguments
	30
	31	#if option R remove all previous indexes
	32	if($opt_R == 1){ #remove indexes
	33	print STDERR "\nremoving $gsdlhome/bin/script/indexes/keyphrase_index.txt\n";
	34	print STDERR "removing $gsdlhome/bin/script/indexes/keyphrase_document.txt\n";
	35	print STDERR "removing $gsdlhome/bin/script/indexes/document_keyphrase.txt\n";
	36	system("rm $gsdlhome/bin/script/indexes/keyphrase_document.txt");
	37	system("rm $gsdlhome/bin/script/indexes/document_keyphrase.txt");
	38	system("rm $gsdlhome/bin/script/indexes/keyphrase_index.txt");
	39	}
	40
	41	#collections may be specified in the command line
	42	#otherwise, all collections will be used to build
	43	#the indexes.
	44	if(@ARGV){
	45
	46	@directories = @ARGV;
	47
	48	} else { #open collect directory and get a list of all collections
	49	opendir(DIR, "$gsdlhome/collect");
	50	@directories = grep(!/(^\.\|(CVS)\|(modelcol))/, readdir(DIR));
	51	closedir(DIR);
	52	}
	53
	54	#for each collection specified to build indexes for
	55	foreach $collection (@directories){
	56
	57	my @filelist;
	58
	59	#archives.inf contains a list of unique hash ID's of each file and file paths
	60	open(INFO, "$gsdlhome/collect/$collection/archives/archives.inf")
	61	or die "$gsdlhome/collect/$collection/archives/archives.inf could not be opened.";
	62
	63	while(<INFO>){ #get each line of text from archives.inf (OID \t filepath)
	64	chomp;
	65	push(@filelist, $_);
	66	}
	67
	68	foreach $file (@filelist){ #add each document to the indexes
	69	build_index($file, $collection);
	70	}
	71	}
	72
	73	#This function opens the file in the filepath sent as an argument. From this it obtains
	74	#the kea data, and then searches for these phrases in the file, counting and storing
	75	#how many times each phrase appears. The data is then sent to function keyphrase_document
	76	#with arguments hash ID, kea phrases to build the keyphrase_document index.
	77	#The function which builds the document_keyphrase index is then passed the hash ID, the kea
	78	#phrases and the array/s which hold the number of times each phrase
	79	#appears in the document so that the data it has collected can be written to document_ keyphrase
	80	#index.
	81
	82	sub build_index {
	83
	84	my $args = shift(@_);
	85	my $collection = shift(@_);
	86	my ($ID, $filepath) = split(/\t/, $args);
	87	my $keaS = "";
	88	my @kea_phrase_counts = 0;
	89	my $text = "";
	90
	91	print STDERR "\nID: $ID\n";
	92	print STDERR "filepath: $filepath\n";
	93
	94	#open file to extract keyphrase information
	95	open(FILE, "$gsdlhome/collect/$collection/archives/$filepath")
	96	or die "$gsdlhome/collect/$collection/archives/$filepath could not be opened.";
	97
	98	#patterns to search for so that we can extract the kea information
	99	my $kea_search = ".* kea=\"([^\"]*)\"";
	100
	101	while(<FILE>){ #get kea and stem data and store
	102	chomp;
	103	$keaS = $1 if (/$kea_search/);
	104	}
	105
	106	close(FILE);
	107
	108	print STDERR "Kea: $keaS\n";
	109
	110	my @kea = split(", ", $keaS);
	111
	112	if(@kea){ #if the data exists
	113
	114	#open the filepath to the current document
	115	open(FILE, "$gsdlhome/collect/$collection/archives/$filepath")
	116	or die "$gsdlhome/collect/$collection/archives/$filepath could not be opened.";
	117
	118	while(<FILE>){ #get the text
	119	chomp;
	120	$text .= $_;
	121	}
	122
	123	#chop out all things in angled brackets
	124	$text =~ s/(<[^>]*>)//g;
	125
	126	#initilise counts
	127	for($i=0; $i<=$#kea; $i++){
	128	$kea_phrase_counts[$i] = 0;
	129	}
	130
	131
	132	#using regular expressions generated from kea-reg
	133	#count how many of each phrase appear in the document
	134	$text_copy = $text;
	135	for($i=0; $i<=$#kea; $i++){ #search for text with kea phrases
	136	my $phrase = $kea[$i];
	137	$reg = &kea_reg(split(/\s+/, $phrase));
	138	while($text_copy =~ s/$reg//i){
	139	$kea_phrase_counts[$i]++; #count the number of kea phrases
	140	}
	141	$text_copy = $text;
	142	}
	143
	144	#write data to keyphrase_document index
	145	&keyphrase_document($ID, $keaS);
	146
	147	#write data to document_keyphrase index
	148	$kea_counts = join(", ", @kea_phrase_counts);
	149	&document_keyphrase($ID, $keaS, $kea_counts);
	150
	151	} else {
	152	print STDERR "No kea data was found in file $filepath\n";
	153	}
	154
	155	}
	156
	157
	158	#returns a regular expression designed to
	159	#search for phrases in text
	160	#eg 'agris caris'
	161	# agris followed by 0 or 1 non-whitespace characters OR
	162	# followed by one or more whitespace
	163	# caris followed by 0 or 1 non-whitespace characters
	164	#modified from original by Stephen Lundy
	165
	166	sub kea_reg {
	167	$regexp = "";
	168
	169	$l = @_;
	170
	171	if ($l > 0) {
	172	$s = shift;
	173	#$regexp = "$s(\\S?)";
	174	$regexp = "$s(\\s+\|\\S?)";
	175
	176	if ($l-1 > 0) {
	177	foreach $s (@_) {
	178	$regexp .= "$s(\\s+\|\\S?)";
	179	}
	180	}
	181	}
	182
	183	return $regexp;
	184	}
	185
	186
	187	#This function is passed as arguments a list of kea phrases and/or stems. Its purpose is to
	188	#check in the keyphrase index file for each phrase and determine whether or not an entry has
	189	#been made for that phrase and an index number assigned to it. If there has not been an entry
	190	#made then an index number is assigned to the phrase and it is written to the file. This
	191	#function is called by document_keyphrase and keyphrase_document. Each line in the file has
	192	#this form:
	193	#-phrase index number:phrase
	194	#This function then returns a table of pairs of the phrases that were sent as arguments to it
	195	#{phrase => phrase index number}.
	196
	197	sub keyphrase_index_search {
	198
	199	my $phrases = shift(@_);
	200	my @phrases = split(", ", $phrases);
	201	my %table;
	202	my $index = 1;
	203	my $create_new_index = 0;
	204
	205	print STDERR "searching keyphrase index...\n";
	206
	207	#initilise table of phrases and index numbers
	208	foreach $phrase (@phrases){
	209	$table{"$phrase"} = "0";
	210	}
	211
	212	#open keyphrase index for appending data and for reading
	213	open(INDEX_OUT, ">>$gsdlhome/bin/script/indexes/keyphrase_index.txt");
	214	open(INDEX_IN, "$gsdlhome/bin/script/indexes/keyphrase_index.txt")
	215	or $create_new_index = 1;
	216
	217	if($create_new_index == 0){
	218	#if the index already exists read in the phrases
	219	while(<INDEX_IN>){
	220	chomp;
	221	foreach $phrase (@phrases){
	222	if(/(\d+):$phrase/){
	223	$index = $1;
	224	$table{"$phrase"} = "$index";
	225	}
	226	}
	227	$index++; #new starting index (one + the last index)
	228	}
	229
	230	close(INDEX_IN);
	231
	232	}
	233
	234	#add new phrases to the phrase index
	235	foreach $phrase (keys %table){
	236	if($table{"$phrase"} eq "0"){
	237	print INDEX_OUT "$index:$phrase\n";
	238	$table{"$phrase"} = "$index";
	239	$index++;
	240	}
	241	}
	242
	243	close(INDEX_OUT);
	244	return %table;
	245	}
	246
	247	#This function is passed as arguments file hash ID and a list of kea phrases
	248	#that exist for that particular file. Its purpose is to write to the keyphrase_document
	249	#index a line for the document it has been sent:
	250	#-phrase index number:number of documents it appears in\|ID
	251	sub keyphrase_document{
	252
	253	my ($ID, $kea) = @_;
	254	my $text = "";
	255	my @textlist;
	256	my $create_new_index = 0;
	257
	258	print STDERR "writing to keyphrase_document.txt...\n";
	259
	260	#get table of phrases and phrase indexes
	261	my %table = keyphrase_index_search($kea);
	262
	263
	264	#open index for reading
	265	open(INDEX_IN, "$gsdlhome/bin/script/indexes/keyphrase_document.txt")
	266	or $create_new_index = 1;
	267
	268	#read in document if file exists
	269	if($create_new_index == 0){
	270
	271	while(<INDEX_IN>){
	272	$text .= $_;
	273	}
	274
	275	close(INDEX_IN);
	276
	277	#split text into lines
	278	@textlist = split(/\n/, $text);
	279
	280	}
	281
	282	#open index for output
	283	open(INDEX_OUT, ">$gsdlhome/bin/script/indexes/keyphrase_document.txt");
	284
	285	if($create_new_index == 0){ #amend existing index
	286
	287	foreach $line (@textlist){
	288	foreach $phrase (keys %table){
	289	if($line =~ /(\d+):(\d+)(.*)/){ #all lines of this form
	290	$index = $1;
	291	if($table{"$phrase"} eq "$index") { #if phrase exists in index
	292	$ids = $3; #get all doc IDs for that keyphrase
	293	if($ids !~ /$ID/){ #if doc ID not already included
	294	$num_docs = $2;
	295	$num_docs++; #increment number of docs
	296	$line = "$index:$num_docs$3\|$ID"; #line to append to index
	297	$table{"$phrase"} = "0";
	298	}
	299	}
	300	}
	301	}
	302	print INDEX_OUT "$line\n";
	303	}
	304	}
	305
	306	#add new phrases to the index
	307	foreach $phrase (keys %table){ #write 'phrase index:1:file ID
	308	if($table{"$phrase"} ne "0"){
	309	my $line = "$table{$phrase}:1:$ID";
	310	print INDEX_OUT "$line\n";
	311	}
	312	}
	313
	314	close(INDEX_OUT);
	315
	316	}
	317
	318	#This function is passed as arguments file hash ID and a list of kea phrases and/or stems
	319	#that exist for that particular file and a list of the number of times each kea and/or stem
	320	#phrase appear in that document. Its purpose is to write to the document_keyphrase
	321	#index a line for the document it has been sent:
	322	#-file ID:number of phrases and/or stems appear in the document
	323	# \|pairs of 'phrase index,number of times the phrase appears in the document'
	324	sub document_keyphrase {
	325
	326	my ($ID, $keaS, $kea_c) = @_;
	327	my $text = "";
	328	my @textlist;
	329	my %phrases;
	330	my $create_new_index = 0;
	331
	332	print STDERR "writing to document_keyphrase.txt...\n";
	333
	334	#split phrase counts into arrays
	335	my @kea_counts = split(", ", $kea_c);
	336
	337	#get table of phrases and phrase indexes
	338	my %table = keyphrase_index_search($keaS);
	339
	340	#split phrases into arrays
	341	my @kea = split(", ", $keaS);
	342
	343	#build new phrases dictionary
	344	for($i=0; $i<=$#kea; $i++){
	345	my $phrase = $table{"$kea[$i]"};
	346	$phrases{"$phrase"} = "$kea_counts[$i]";
	347	}
	348
	349	my @num = keys %phrases;
	350	my $phrasenum = $#num + 1; #number of phrases in doc
	351
	352	#open index for reading
	353	open(INDEX_IN, "$gsdlhome/bin/script/indexes/document_keyphrase.txt")
	354	or $create_new_index = 1;
	355
	356
	357	if($create_new_index == 0){ #index doesn't need to be created
	358
	359	while(<INDEX_IN>){
	360	$text .= $_;
	361	}
	362
	363	close(INDEX_IN);
	364
	365	#split text into lines
	366	@textlist = split(/\n/, $text);
	367
	368	}
	369
	370
	371	#must write this line to the file
	372	#'document ID:num of phrases\|phrase index, number of times phrases appears
	373	my $newline = "$ID:$phrasenum";
	374	foreach $phrase (keys %phrases){
	375	$newline .= "\|$phrase,$phrases{$phrase}";
	376	}
	377
	378	#open index for output
	379	open(INDEX_OUT, ">$gsdlhome/bin/script/indexes/document_keyphrase.txt");
	380
	381	if($create_new_index == 1){ #create a new index
	382
	383	print INDEX_OUT "$newline\n";
	384
	385	} else {
	386
	387	#if ID is already in the file write line overtop incase
	388	#someone has modified the file. Otherwise add the line
	389	#to the end of the file
	390	my $found = 0;
	391
	392	foreach $line (@textlist){
	393	if($line =~ /([^:]+):(.*)/){ #all lines should follow this pattern
	394	$id = $1;
	395	if($ID eq $id){ #id is already in the file
	396	print INDEX_OUT "$newline\n"; #print line overtop
	397	$found = 1;
	398	} else {
	399	print INDEX_OUT "$line\n"; #print old line out
	400	}
	401	}
	402	}
	403
	404	print INDEX_OUT "$newline\n" if ($found == 0); #append new line to end of file
	405
	406	}
	407
	408	close(INDEX_OUT);
	409
	410	}
	411

Note: See TracBrowser for help on using the repository browser.

Download in other formats: