Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: main/tags/2.85/greenstone2/bin/script/indexes/buildkpiK.pl

Last change on this file was 1971, checked in by jmt14, 23 years ago

added files: Core.pm PDF.pm Parse.pm amend_pdf.pl

buildkpi.pl buildkpiS.pl buildkpiK.pl relation.pl

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 11.9 KB

Line
1	#! /usr/bin/perl
2
3	#usage: perl buildkpiK.pl [-R] [collection] [collection] etc
4	#
5	#-r or -R will remove previous index files so that you may build new ones
6	#
7	#The program performs the following tasks:
8	#-gathers the specified collections on the command line OR
9	#-gathers the directories of all the collections in the collect directory, this is all
10	# the directories apart from CVS, modelcol, . and .. which are not collections.
11	#-It then retrieves the archives.inf file from the archives directory of each collection
12	# to obtain the unique file ID and filepath of every document in the collection
13	#-Then parse through each doc.gml stored in filepath to gather information
14	#-From each file collect kea phrases.
15	#-Determine the number of kea phrases for each document
16	#-Search for the kea phrases in the numbered phrase index. If a phrase is not there then
17	# the program will write the kea phrase to the phrase index .
18	#-Search for the kea phrases in the keyphrase to document index. If the phrase is there,
19	# increment the number of documents that the keyphrase appears in and append the
20	# document's hash ID to the list of documents in the entry. If the phrase is not there
21	# then the program will add a new entry for it to the keyphrase to document index.
22	#-Then write document ID, no of phrase, phrase number from index followed by number of times
23	# phrase appears into the document_keyphrase index
24
# Root of the Greenstone installation; every path below is built from it.
# This stays a package global because the index-building subs read it directly.
$gsdlhome = $ENV{'GSDLHOME'};
die "GSDLHOME is not set in the environment.\n"
    unless defined $gsdlhome && length $gsdlhome;

# Process option arguments. Getopt::Std replaces getopts.pl, which is no
# longer shipped with Perl. Both -R and -r are accepted, as the usage says.
use Getopt::Std;
my %opt;
getopts('Rr', \%opt)
    or die "usage: perl buildkpiK.pl [-R] [collection] [collection] etc\n";

# If option R (or r) was given, remove all previous indexes.
if ($opt{R} || $opt{r}) {
    my $index_dir = "$gsdlhome/bin/script/indexes";
    print STDERR "\n";
    for my $name (qw(keyphrase_index.txt keyphrase_document.txt document_keyphrase.txt)) {
        my $path = "$index_dir/$name";
        print STDERR "removing $path\n";
        next unless -e $path;    # nothing to remove on a first build
        unlink($path) or warn "could not remove $path: $!\n";
    }
}

# Collections may be specified on the command line; otherwise all
# collections in the collect directory are used to build the indexes.
my @directories;
if (@ARGV) {
    @directories = @ARGV;
}
else {
    my $collect_dir = "$gsdlhome/collect";
    opendir(my $dh, $collect_dir)
        or die "$collect_dir could not be opened: $!";
    # Skip dot entries, CVS and modelcol (not collections) and anything
    # that is not a directory.
    @directories = sort grep {
        !/^\./ && $_ ne 'CVS' && $_ ne 'modelcol' && -d "$collect_dir/$_"
    } readdir($dh);
    closedir($dh);
}

# For each collection, add every document listed in archives.inf to the indexes.
for my $collection (@directories) {

    # archives.inf holds one "OID \t filepath" line per document.
    my $inf = "$gsdlhome/collect/$collection/archives/archives.inf";
    open(my $info, '<', $inf)
        or die "$inf could not be opened: $!";

    my @filelist;
    while (my $entry = <$info>) {
        chomp $entry;
        next unless length $entry;    # ignore blank lines
        push(@filelist, $entry);
    }
    close($info);

    for my $file (@filelist) {
        build_index($file, $collection);
    }
}
72
# build_index($entry, $collection)
#
# $entry is one line of archives.inf ("hash ID \t filepath") and $collection
# is the name of the collection the document belongs to.
#
# The document is read once. The value of its last kea="..." attribute gives
# the comma-separated list of kea phrases. All text in angled brackets is then
# stripped and the occurrences of each phrase in the remaining text are
# counted, using the regular expression built by kea_reg. The hash ID and the
# phrases are passed to keyphrase_document. The hash ID, the phrases and the
# matching comma-separated list of counts are passed to document_keyphrase.
#
# Dies if the document cannot be opened. Entries without a filepath and
# documents without kea data are reported on STDERR and skipped.
sub build_index {

    my ($args, $collection) = @_;
    my ($ID, $filepath) = split(/\t/, $args);

    unless (defined $ID && defined $filepath && length $filepath) {
        print STDERR "\nSkipping malformed archives.inf entry: $args\n";
        return;
    }

    print STDERR "\nID: $ID\n";
    print STDERR "filepath: $filepath\n";

    my $docpath = "$gsdlhome/collect/$collection/archives/$filepath";
    open(my $doc, '<', $docpath)
        or die "$docpath could not be opened: $!";

    # One pass collects both the kea attribute and the document text.
    my $keaS = "";
    my $text = "";
    while (my $line = <$doc>) {
        chomp $line;
        $keaS = $1 if $line =~ /.* kea="([^"]*)"/;
        $text .= $line;
    }
    close($doc);

    print STDERR "Kea: $keaS\n";

    my @kea = split(", ", $keaS);

    unless (@kea) {
        print STDERR "No kea data was found in file $filepath\n";
        return;
    }

    # chop out all things in angled brackets
    $text =~ s/<[^>]*>//g;

    # Count how many times each phrase appears in the document. Matching is
    # done with //g on the untouched text: deleting each match (as was done
    # before) splices the surrounding text together and can create
    # occurrences that are not in the document.
    my @kea_phrase_counts;
    for my $phrase (@kea) {
        my $count = 0;
        my $reg   = kea_reg(split(/\s+/, $phrase));

        if (length $reg) {
            my $compiled = eval { qr/$reg/i };
            if (defined $compiled) {
                $count++ while $text =~ /$compiled/g;
            }
            else {
                print STDERR "Could not build a search pattern for phrase '$phrase'\n";
            }
        }

        push(@kea_phrase_counts, $count);
    }

    # write data to keyphrase_document index
    keyphrase_document($ID, $keaS);

    # write data to document_keyphrase index
    document_keyphrase($ID, $keaS, join(", ", @kea_phrase_counts));
}
156
157
# kea_reg(@words)
#
# Returns, as a string, a regular expression designed to search for a phrase
# in text. Each word of the phrase is followed by a group that accepts either
# one or more whitespace characters or zero or one non-whitespace character.
# eg 'agris caris' gives
#   agris(\s+|\S?)caris(\s+|\S?)
#
# Words are passed through quotemeta so that characters such as '+', '(' or
# '.' in a phrase are matched literally instead of being read as regular
# expression syntax. Empty words are ignored. With no words the empty string
# is returned.
#
# modified from original by Stephen Lundy
sub kea_reg {
    my @words = grep { defined($_) && length($_) } @_;

    return join('', map { quotemeta($_) . '(\s+|\S?)' } @words);
}
185
186
# keyphrase_index_search($phrases)
#
# $phrases is a ", "-separated list of kea phrases and/or stems. Each line of
# the keyphrase index file ($gsdlhome/bin/script/indexes/keyphrase_index.txt)
# has the form
#   phrase index number:phrase
# The file is read once and every requested phrase that already has an entry
# is looked up by exact comparison, so that "cat" is not confused with
# "catalog". Phrases without an entry are given the next free index number
# (one more than the highest number in the file) in the order they were
# passed, and are appended to the file.
#
# Returns a table of pairs {phrase => phrase index number} for the phrases
# that were passed. Dies if new entries cannot be written to the index file.
# This function is called by document_keyphrase and keyphrase_document.
sub keyphrase_index_search {

    my $phrases = shift(@_);
    my @phrases = defined $phrases ? split(", ", $phrases) : ();
    my $index_file = "$gsdlhome/bin/script/indexes/keyphrase_index.txt";
    my %wanted = map { $_ => 1 } @phrases;
    my %table;
    my $next_index = 1;

    print STDERR "searching keyphrase index...\n";

    # If the index already exists read in the phrases. A missing file just
    # means that a new index is started.
    if (open(my $in, '<', $index_file)) {
        while (my $line = <$in>) {
            chomp $line;
            next unless $line =~ /^(\d+):(.*)$/;
            my ($number, $phrase) = ($1, $2);

            # new starting index (one + the highest index in use)
            $next_index = $number + 1 if $number >= $next_index;

            $table{$phrase} = $number if $wanted{$phrase};
        }
        close($in);
    }

    # add new phrases to the phrase index
    my @missing = grep { !exists $table{$_} } @phrases;
    if (@missing) {
        open(my $out, '>>', $index_file)
            or die "$index_file could not be opened for appending: $!";

        for my $phrase (@missing) {
            next if exists $table{$phrase};    # phrase was listed twice
            print $out "$next_index:$phrase\n";
            $table{$phrase} = $next_index;
            $next_index++;
        }

        close($out)
            or die "$index_file could not be written: $!";
    }

    return %table;
}
246
# keyphrase_document($ID, $kea)
#
# $ID is the hash ID of a file and $kea the ", "-separated list of kea
# phrases that exist for that file. The keyphrase_document index
# ($gsdlhome/bin/script/indexes/keyphrase_document.txt) holds one line per
# phrase:
#   phrase index number:number of documents it appears in:ID|ID|...
# For every phrase of the document the function either
# - adds $ID to the existing line of the phrase and increments its document
#   count, unless $ID is already listed there, or
# - writes a new line 'phrase index:1:ID' when the phrase has no line yet.
# Lines of other phrases are written back unchanged.
#
# Dies if the index cannot be opened for writing.
sub keyphrase_document {

    my ($ID, $kea) = @_;
    my $index_file = "$gsdlhome/bin/script/indexes/keyphrase_document.txt";

    print STDERR "writing to keyphrase_document.txt...\n";

    # get table of phrases and phrase indexes
    my %table = keyphrase_index_search($kea);

    # phrase index numbers that belong to this document
    my %wanted = map { $_ => 1 } values %table;

    # read in the index if the file exists
    my @textlist;
    if (open(my $in, '<', $index_file)) {
        chomp(@textlist = <$in>);
        close($in);
    }

    # amend the lines of phrases that are already in the index
    my %found;
    for my $line (@textlist) {
        next unless $line =~ /^(\d+):(\d+)(.*)$/;    # all lines of this form
        my ($index, $num_docs, $ids) = ($1, $2, $3);
        next unless $wanted{$index};

        # The phrase has a line, so no new line must be written for it, even
        # when this document is already listed (the index is being rebuilt).
        $found{$index} = 1;

        # compare whole IDs so that one ID is never mistaken for part of another
        (my $id_list = $ids) =~ s/^://;
        next if grep { $_ eq $ID } split(/\|/, $id_list);

        $num_docs++;    # increment number of docs
        $line = "$index:$num_docs$ids|$ID";
    }

    # open index for output
    open(my $out, '>', $index_file)
        or die "$index_file could not be opened for writing: $!";

    for my $line (@textlist) {
        print $out "$line\n";
    }

    # add new phrases to the index: 'phrase index:1:file ID'
    for my $index (sort { $a <=> $b } keys %wanted) {
        next if $found{$index};
        print $out "$index:1:$ID\n";
    }

    close($out)
        or die "$index_file could not be written: $!";
}
317
# document_keyphrase($ID, $keaS, $kea_c)
#
# $ID is the hash ID of a file, $keaS the ", "-separated list of kea phrases
# and/or stems that exist for that file, and $kea_c the ", "-separated list
# of the number of times each of those phrases appears in the document (same
# order as $keaS). The function writes to the document_keyphrase index
# ($gsdlhome/bin/script/indexes/document_keyphrase.txt) a line for the
# document:
#   file ID:number of phrases|phrase index,count|phrase index,count|...
# The pairs are written in ascending order of phrase index. If the index
# already has a line for $ID it is replaced in place, otherwise the line is
# added to the end of the file. Lines that do not have the form 'ID:...' are
# not written back.
#
# Dies if the index cannot be opened for writing.
sub document_keyphrase {

    my ($ID, $keaS, $kea_c) = @_;
    my $index_file = "$gsdlhome/bin/script/indexes/document_keyphrase.txt";

    print STDERR "writing to document_keyphrase.txt...\n";

    # split phrase counts into arrays
    my @kea_counts = split(", ", $kea_c);

    # get table of phrases and phrase indexes
    my %table = keyphrase_index_search($keaS);

    # split phrases into arrays
    my @kea = split(", ", $keaS);

    # build dictionary {phrase index => number of times phrase appears}
    my %phrases;
    for my $i (0 .. $#kea) {
        my $index = $table{ $kea[$i] };
        next unless defined $index;
        $phrases{$index} = defined $kea_counts[$i] ? $kea_counts[$i] : 0;
    }

    my $phrasenum = scalar(keys %phrases);    # number of phrases in doc

    # must write this line to the file
    # 'document ID:num of phrases|phrase index,number of times phrase appears'
    my $newline = join('|', "$ID:$phrasenum",
        map { "$_,$phrases{$_}" } sort { $a <=> $b } keys %phrases);

    # read in the index if the file exists
    my @textlist;
    if (open(my $in, '<', $index_file)) {
        chomp(@textlist = <$in>);
        close($in);
    }

    # open index for output
    open(my $out, '>', $index_file)
        or die "$index_file could not be opened for writing: $!";

    # If ID is already in the file write the line overtop in case someone has
    # modified the file. Otherwise add the line to the end of the file.
    my $found = 0;
    for my $line (@textlist) {
        next unless $line =~ /([^:]+):(.*)/;    # all lines should follow this pattern

        if ($1 eq $ID) {
            # write the document once, even if it was listed more than once
            print $out "$newline\n" unless $found;
            $found = 1;
        }
        else {
            print $out "$line\n";               # print old line out
        }
    }

    print $out "$newline\n" unless $found;      # append new line to end of file

    close($out)
        or die "$index_file could not be written: $!";
}
411

Note: See TracBrowser for help on using the repository browser.

Download in other formats: