source: trunk/gsdl/bin/script/indexes/buildkpi.pl@ 1971

Last change on this file since 1971 was 1971, checked in by jmt14, 23 years ago

added files: Core.pm PDF.pm Parse.pm amend_pdf.pl

buildkpi.pl buildkpiS.pl buildkpiK.pl relation.pl

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.3 KB
Line 
1#! /user/bin/perl
2
3#usage: perl buildkpi.pl [-R] [collection] [collection] etc
4#
5#-r or -R will remove previous index files so that you may build new ones
6#
7#The program performs the following tasks:
8#-gathers the specified collections on the command line OR
9#-gathers the directories of all the collections in the collect directory, this is all
10# the directories apart from CVS, modelcol, . and .. which are not collections.
11#-It then retrieves the archive.inf file from the archive directory of each collection
12# to obtain the unique file ID and filepath of every document in the collection
13#-Then parse through each doc.gml stored in filepath to gather information
14#-From each file collect kea phrases and/or stems
15#-Determine the number of kea phrases and stems for each document
16#-Search for the kea phrases in the numbered phrase index. If a phrase is not there then
17# the program will write the kea phrase to the phrase index .
18#-Search for the kea phrases in the keyphrase to document index. If the phrase is there,
19# it will increment the number of documents that the keyphrase appears in and replace that
20# then append the hash ID to the list of documents in the entry. If the phrase is not there
21# then the program will write the kea phrase to the phrase index.
22#-Then write document ID, no of phrase, phrase number from index followed by number of times
23# phrase appears into the document_keyphrase index
24
25$gsdlhome = $ENV{'GSDLHOME'};
26
27require "getopts.pl";
28&Getopts('R'); #process option arguments
29
30
31#collections may be specified in the command line
32#otherwise, all collections will be used to build
33#the indexes.
34if(@ARGV){
35
36 @directories = @ARGV;
37
38} else { #open collect directory and get a list of all collections
39 opendir(DIR, "$gsdlhome/collect");
40 @directories = grep(!/(^\.|(CVS)|(modelcol))/, readdir(DIR));
41 closedir(DIR);
42}
43
44
45#directory to store indexes in using collection names
46$dirname = "";
47foreach $collection (sort @directories){
48 $dirname .= $collection."_";
49}
50$dirname .= "indexes";
51print STDERR "directory name: $dirname\n";
52
53#if option R remove all previous indexes
54if($opt_R == 1){ #remove indexes
55 print STDERR "\nremoving $gsdlhome/bin/script/indexes/$dirname/keyphrase_index.txt\n";
56 print STDERR "removing $gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt\n";
57 print STDERR "removing $gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt\n";
58 system("rm $gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt");
59 system("rm $gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt");
60 system("rm $gsdlhome/bin/script/indexes/$dirname/keyphrase_index.txt");
61}
62
63#create the new directory, display an error message if the directory
64#already exists.
65`mkdir --verbose $gsdlhome/bin/script/indexes/$dirname`;
66
67#for each collection specified to build indexes for
68foreach $collection (@directories){
69
70 print STDERR "\nBUILDING INDEXES FOR COLLECTION $collection\n\n";
71
72 my @filelist;
73
74 #archives.inf contains a list of unique hash ID's of each file and file paths
75 open(INFO, "$gsdlhome/collect/$collection/archives/archives.inf")
76 or die "$gsdlhome/collect/$collection/archives/archives.inf could not be opened.";
77
78 while(<INFO>){ #get each line of text from archives.inf (OID \t filepath)
79 chomp;
80 push(@filelist, $_);
81 }
82
83 foreach $file (@filelist){ #add each document to the indexes
84 build_index($file, $collection);
85 }
86}
87
88#This function opens the file in the filepath sent as an argument. From this it obtains
89#the kea and/or stem data, and then searches for these phrases in the file, counting and storing
90#how many times each phrase appears. The data is then sent to function keyphrase_document
91#with arguments hash ID, kea phrases and stem phrases to build the keyphrase_document index.
92#The function which builds the document_keyphrase index is then passed the hash ID, the kea
93#phrases and/or the stemmed phrases and the array/s which hold the number of times each phrase
94#appears in the document so that the data it has collected can be written to document_ keyphrase
95#index.
96
97sub build_index {
98
99 my $args = shift(@_);
100 my $collection = shift(@_);
101 my ($ID, $filepath) = split(/\t/, $args);
102 my $keaS = "";
103 my $stemsS = "";
104 my @kea_phrase_counts = 0;
105 my @stem_phrase_counts = 0;
106 my $text = "";
107
108 print STDERR "\nID: $ID\n";
109 print STDERR "filepath: $filepath\n";
110
111 #open file to extract keyphrase information
112 open(FILE, "$gsdlhome/collect/$collection/archives/$filepath")
113 or die "$gsdlhome/collect/$collection/archives/$filepath could not be opened.";
114
115 #patterns to search for so that we can extract the kea information
116 my $kea_search = ".* kea=\"([^\"]*)\"";
117 my $stem_search = "stems=\"([^\"]*)\"";
118
119 while(<FILE>){ #get kea and stem data and store
120 chomp;
121 $keaS = $1 if (/$kea_search/);
122 $stemsS = $1 if (/$stem_search/);
123 }
124
125 close(FILE);
126
127 print STDERR "Kea: $keaS\n";
128 print STDERR "stems: $stemsS\n";
129
130 my @kea = split(", ", $keaS);
131 my @stems = split(", ", $stemsS);
132
133 if(@kea && @stems){ #if the data exists
134
135 #open the filepath to the current document
136 open(FILE, "$gsdlhome/collect/$collection/archives/$filepath")
137 or die "$gsdlhome/collect/$collection/archives/$filepath could not be opened.";
138
139 while(<FILE>){ #get the text
140 chomp;
141 $text .= $_;
142 }
143
144 #chop out all things in angled brackets
145 $text =~ s/(<[^>]*>)//g;
146
147 #initilise counts
148 for($i=0; $i<=$#kea; $i++){
149 $kea_phrase_counts[$i] = 0;
150 }
151
152 for($i=0; $i<=$#stems; $i++){
153 $stem_phrase_counts[$i] = 0;
154 }
155
156 print STDERR "counting number of kea phrases in document...\n";
157
158 #using regular expressions generated from kea-reg and stem-reg
159 #count how many of each phrase appear in the document
160 $text_copy = $text;
161 for($i=0; $i<=$#kea; $i++){ #search for text with kea phrases
162 my $phrase = $kea[$i];
163 $reg = &kea_reg(split(/\s+/, $phrase));
164 while($text_copy =~ s/$reg//i){
165 $kea_phrase_counts[$i]++; #count the number of kea phrases
166 }
167 $text_copy = $text;
168 }
169
170 print STDERR "counting number of stemmed phrases in document...\n";
171
172 $text_copy = $text;
173 for($i=0; $i<=$#stems; $i++){ #search for text with stem phrases
174 my $stem = $stems[$i];
175 $reg = &stem_reg(split(/\s+/, $stem));
176 while($text_copy =~ s/$reg//i){
177 $stem_phrase_counts[$i]++; #count the number of stem phrases
178 }
179 $text_copy = $text;
180 }
181
182
183 #write data to keyphrase_document index
184 &keyphrase_document($ID, $keaS, $stemsS);
185
186 #write data to document_keyphrase index
187 $kea_counts = join(", ", @kea_phrase_counts);
188 $stem_counts = join(", ", @stem_phrase_counts);
189 &document_keyphrase($ID, $keaS, $stemsS, $kea_counts, $stem_counts);
190
191 } else {
192 print STDERR "No kea data was found in file $filepath\n";
193 }
194
195}
196
197#returns a regular expression designed to
198#search for stems in text
199#eg 'agri cari'
200# agri followed by 0 or more non-whitespace characters
201# followed by one or more whitespace OR 0 or 1 non-whitespace characters
202# cari followed by 0 or more non-whitespace characters
203#modified from original by Stephen Lundy
204
205sub stem_reg {
206
207 $regexp = "";
208
209 $l = @_;
210
211 if ($l > 0) {
212 $s = shift;
213 $regexp = "$s\\S*";
214
215 if ($l-1 > 0) {
216 foreach $s (@_) {
217 $regexp .= "(\\s+|\\S?)$s\\S*";
218 }
219 }
220 }
221
222 return $regexp;
223}
224
225#returns a regular expression designed to
226#search for phrases in text
227#eg 'agris caris'
228# agris followed by 0 or 1 non-whitespace characters OR
229# followed by one or more whitespace
230# caris followed by 0 or 1 non-whitespace characters
231#modified from original by Stephen Lundy
232
233sub kea_reg {
234 $regexp = "";
235
236 $l = @_;
237
238 if ($l > 0) {
239 $s = shift;
240 #$regexp = "$s(\\S?)";
241 $regexp = "$s(\\s+|\\S?)";
242
243 if ($l-1 > 0) {
244 foreach $s (@_) {
245 $regexp .= "$s(\\s+|\\S?)";
246 }
247 }
248 }
249
250 return $regexp;
251}
252
253
254#This function is passed as arguments a list of kea phrases and/or stems. Its purpose is to
255#check in the keyphrase index file for each phrase and determine whether or not an entry has
256#been made for that phrase and an index number assigned to it. If there has not been an entry
257#made then an index number is assigned to the phrase and it is written to the file. This
258#function is called by document_keyphrase and keyphrase_document. Each line in the file has
259#this form:
260#-phrase index number:phrase
261#This function then returns a table of pairs of the phrases that were sent as arguments to it
262#{phrase => phrase index number}.
263
264sub keyphrase_index_search {
265
266 my $phrases = shift(@_);
267 my @phrases = split(", ", $phrases);
268 my %table;
269 my $index = 1;
270 my $create_new_index = 0;
271
272 print STDERR "searching keyphrase index...\n";
273
274 #initilise table of phrases and index numbers
275 foreach $phrase (@phrases){
276 $table{"$phrase"} = "0";
277 }
278
279 #open keyphrase index for appending data and for reading
280 open(INDEX_OUT, ">>$gsdlhome/bin/script/indexes/$dirname/keyphrase_index.txt");
281 open(INDEX_IN, "$gsdlhome/bin/script/indexes/$dirname/keyphrase_index.txt")
282 or $create_new_index = 1;
283
284 if($create_new_index == 0){
285 #if the index already exists read in the phrases
286 while(<INDEX_IN>){
287 chomp;
288 foreach $phrase (@phrases){
289 if(/(\d+):$phrase/){
290 $index = $1;
291 $table{"$phrase"} = "$index";
292 }
293 }
294 $index++; #new starting index (one + the last index)
295 }
296
297 close(INDEX_IN);
298
299 }
300
301 #add new phrases to the phrase index
302 foreach $phrase (keys %table){
303 if($table{"$phrase"} eq "0"){
304 print INDEX_OUT "$index:$phrase\n";
305 $table{"$phrase"} = "$index";
306 $index++;
307 }
308 }
309
310 close(INDEX_OUT);
311 return %table;
312}
313
314#This function is passed as arguments file hash ID and a list of kea phrases and/or stems
315#that exist for that particular file. Its purpose is to write to the keyphrase_document
316#index a line for the document it has been sent:
317#-phrase index number:number of documents it appears in|ID
318sub keyphrase_document{
319
320 my ($ID, $kea, $stems) = @_;
321 my $text = "";
322 my @textlist;
323 my $create_new_index = 0;
324
325 print STDERR "writing to keyphrase_document.txt...\n";
326
327 #get table of phrases and phrase indexes
328 my %table = keyphrase_index_search($kea.", ".$stems);
329
330
331 #open index for reading
332 open(INDEX_IN, "$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt")
333 or $create_new_index = 1;
334
335 #read in document if file exists
336 if($create_new_index == 0){
337
338 while(<INDEX_IN>){
339 $text .= $_;
340 }
341
342 close(INDEX_IN);
343
344 #split text into lines
345 @textlist = split(/\n/, $text);
346
347 }
348
349 #open index for output
350 open(INDEX_OUT, ">$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt");
351
352 if($create_new_index == 0){ #amend existing index
353
354 foreach $line (@textlist){
355 foreach $phrase (keys %table){
356 if($line =~ /(\d+):(\d+)(.*)/){ #all lines of this form
357 $index = $1;
358 if($table{"$phrase"} eq "$index") { #if phrase exists in index
359 $ids = $3; #get all doc IDs for that keyphrase
360 if($ids !~ /$ID/){ #if doc ID not already included
361 $num_docs = $2;
362 $num_docs++; #increment number of docs
363 $line = "$index:$num_docs$3|$ID"; #line to append to index
364 $table{"$phrase"} = "0";
365 }
366 }
367 }
368 }
369 print INDEX_OUT "$line\n";
370 }
371 }
372
373 #add new phrases to the index
374 foreach $phrase (keys %table){ #write 'phrase index:1:file ID
375 if($table{"$phrase"} ne "0"){
376 my $line = "$table{$phrase}:1:$ID";
377 print INDEX_OUT "$line\n";
378 }
379 }
380
381 close(INDEX_OUT);
382
383}
384
385#This function is passed as arguments file hash ID and a list of kea phrases and/or stems
386#that exist for that particular file and a list of the number of times each kea and/or stem
387#phrase appear in that document. Its purpose is to write to the document_keyphrase
388#index a line for the document it has been sent:
389#-file ID:number of phrases and/or stems appear in the document
390# |pairs of 'phrase index,number of times the phrase appears in the document'
391sub document_keyphrase {
392
393 my ($ID, $keaS, $stemsS, $kea_c, $stem_c) = @_;
394 my $text = "";
395 my @textlist;
396 my %phrases;
397 my $create_new_index = 0;
398
399 print STDERR "writing to document_keyphrase.txt...\n";
400
401 #split phrase counts into arrays
402 my @kea_counts = split(", ", $kea_c);
403 my @stem_counts = split(", ", $stem_c);
404
405 #get table of phrases and phrase indexes
406 my %table = keyphrase_index_search($keaS.", ".$stemsS);
407
408 #split phrases into arrays
409 my @kea = split(", ", $keaS);
410 my @stems = split(", ", $stemsS);
411
412 #build new phrases dictionary
413 for($i=0; $i<=$#kea; $i++){
414 my $phrase = $table{"$kea[$i]"};
415 if($kea_counts[$i] > 0){
416 $phrases{"$phrase"} = "$kea_counts[$i]";
417 } else {
418 $phrases{"$phrase"} = 1;
419 }
420 }
421 for($i=0; $i<=$#stems; $i++){
422 my $phrase = $table{"$stems[$i]"};
423 if($stem_counts[$i] > 0){
424 $phrases{"$phrase"} = "$stem_counts[$i]";
425 } else {
426 $phrases{"$phrase"} = 1;
427 }
428 }
429 my @num = keys %phrases;
430 my $phrasenum = $#num + 1; #number of phrases in doc
431
432 #open index for reading
433 open(INDEX_IN, "$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt")
434 or $create_new_index = 1;
435
436
437 if($create_new_index == 0){ #index doesn't need to be created
438
439 while(<INDEX_IN>){
440 $text .= $_;
441 }
442
443 close(INDEX_IN);
444
445 #split text into lines
446 @textlist = split(/\n/, $text);
447
448 }
449
450
451 #must write this line to the file
452 #document ID:num of phrases|phrase index, number of times phrases appears
453 my $newline = "$ID:$phrasenum";
454 foreach $phrase (keys %phrases){
455 $newline .= "|$phrase,$phrases{$phrase}";
456 }
457
458 #open index for output
459 open(INDEX_OUT, ">$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt");
460
461 if($create_new_index == 1){ #create a new index
462
463 print INDEX_OUT "$newline\n";
464
465 } else {
466
467 #if ID is already in the file write line overtop incase
468 #someone has modified the file. Otherwise add the line
469 #to the end of the file
470 my $found = 0;
471
472 foreach $line (@textlist){
473 if($line =~ /([^:]+):(.*)/){ #all lines should follow this pattern
474 $id = $1;
475 if($ID eq $id){ #id is already in the file
476 print INDEX_OUT "$newline\n"; #print line overtop
477 $found = 1;
478 } else {
479 print INDEX_OUT "$line\n"; #print old line out
480 }
481 }
482 }
483
484 print INDEX_OUT "$newline\n" if ($found == 0); #append new line to end of file
485
486 }
487
488 close(INDEX_OUT);
489
490}
491
Note: See TracBrowser for help on using the repository browser.