source: trunk/gsdl/bin/script/indexes/buildkpiK.pl@ 1971

Last change on this file since 1971 was 1971, checked in by jmt14, 23 years ago

added files: Core.pm PDF.pm Parse.pm amend_pdf.pl

buildkpi.pl buildkpiS.pl buildkpiK.pl relation.pl

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.9 KB
RevLine 
[1971]1#! /user/bin/perl
2
3#usage: perl buildkpiK.pl [-R] [collection] [collection] etc
4#
5#-r or -R will remove previous index files so that you may build new ones
6#
7#The program performs the following tasks:
8#-gathers the specified collections on the command line OR
9#-gathers the directories of all the collections in the collect directory, this is all
10# the directories apart from CVS, modelcol, . and .. which are not collections.
11#-It then retrieves the archive.inf file from the archive directory of each collection
12# to obtain the unique file ID and filepath of every document in the collection
13#-Then parse through each doc.gml stored in filepath to gather information
14#-From each file collect kea phrases.
15#-Determine the number of kea phrases for each document
16#-Search for the kea phrases in the numbered phrase index. If a phrase is not there then
17# the program will write the kea phrase to the phrase index .
18#-Search for the kea phrases in the keyphrase to document index. If the phrase is there,
19# it will increment the number of documents that the keyphrase appears in and replace that
20# then append the hash ID to the list of documents in the entry. If the phrase is not there
21# then the program will write the kea phrase to the phrase index.
22#-Then write document ID, no of phrase, phrase number from index followed by number of times
23# phrase appears into the document_keyphrase index
24
25$gsdlhome = $ENV{'GSDLHOME'};
26$collection;
27
28require "getopts.pl";
29&Getopts('R'); #process option arguments
30
31#if option R remove all previous indexes
32if($opt_R == 1){ #remove indexes
33 print STDERR "\nremoving $gsdlhome/bin/script/indexes/keyphrase_index.txt\n";
34 print STDERR "removing $gsdlhome/bin/script/indexes/keyphrase_document.txt\n";
35 print STDERR "removing $gsdlhome/bin/script/indexes/document_keyphrase.txt\n";
36 system("rm $gsdlhome/bin/script/indexes/keyphrase_document.txt");
37 system("rm $gsdlhome/bin/script/indexes/document_keyphrase.txt");
38 system("rm $gsdlhome/bin/script/indexes/keyphrase_index.txt");
39}
40
41#collections may be specified in the command line
42#otherwise, all collections will be used to build
43#the indexes.
44if(@ARGV){
45
46 @directories = @ARGV;
47
48} else { #open collect directory and get a list of all collections
49 opendir(DIR, "$gsdlhome/collect");
50 @directories = grep(!/(^\.|(CVS)|(modelcol))/, readdir(DIR));
51 closedir(DIR);
52}
53
54#for each collection specified to build indexes for
55foreach $collection (@directories){
56
57 my @filelist;
58
59 #archives.inf contains a list of unique hash ID's of each file and file paths
60 open(INFO, "$gsdlhome/collect/$collection/archives/archives.inf")
61 or die "$gsdlhome/collect/$collection/archives/archives.inf could not be opened.";
62
63 while(<INFO>){ #get each line of text from archives.inf (OID \t filepath)
64 chomp;
65 push(@filelist, $_);
66 }
67
68 foreach $file (@filelist){ #add each document to the indexes
69 build_index($file, $collection);
70 }
71}
72
73#This function opens the file in the filepath sent as an argument. From this it obtains
74#the kea data, and then searches for these phrases in the file, counting and storing
75#how many times each phrase appears. The data is then sent to function keyphrase_document
76#with arguments hash ID, kea phrases to build the keyphrase_document index.
77#The function which builds the document_keyphrase index is then passed the hash ID, the kea
78#phrases and the array/s which hold the number of times each phrase
79#appears in the document so that the data it has collected can be written to document_ keyphrase
80#index.
81
82sub build_index {
83
84 my $args = shift(@_);
85 my $collection = shift(@_);
86 my ($ID, $filepath) = split(/\t/, $args);
87 my $keaS = "";
88 my @kea_phrase_counts = 0;
89 my $text = "";
90
91 print STDERR "\nID: $ID\n";
92 print STDERR "filepath: $filepath\n";
93
94 #open file to extract keyphrase information
95 open(FILE, "$gsdlhome/collect/$collection/archives/$filepath")
96 or die "$gsdlhome/collect/$collection/archives/$filepath could not be opened.";
97
98 #patterns to search for so that we can extract the kea information
99 my $kea_search = ".* kea=\"([^\"]*)\"";
100
101 while(<FILE>){ #get kea and stem data and store
102 chomp;
103 $keaS = $1 if (/$kea_search/);
104 }
105
106 close(FILE);
107
108 print STDERR "Kea: $keaS\n";
109
110 my @kea = split(", ", $keaS);
111
112 if(@kea){ #if the data exists
113
114 #open the filepath to the current document
115 open(FILE, "$gsdlhome/collect/$collection/archives/$filepath")
116 or die "$gsdlhome/collect/$collection/archives/$filepath could not be opened.";
117
118 while(<FILE>){ #get the text
119 chomp;
120 $text .= $_;
121 }
122
123 #chop out all things in angled brackets
124 $text =~ s/(<[^>]*>)//g;
125
126 #initilise counts
127 for($i=0; $i<=$#kea; $i++){
128 $kea_phrase_counts[$i] = 0;
129 }
130
131
132 #using regular expressions generated from kea-reg
133 #count how many of each phrase appear in the document
134 $text_copy = $text;
135 for($i=0; $i<=$#kea; $i++){ #search for text with kea phrases
136 my $phrase = $kea[$i];
137 $reg = &kea_reg(split(/\s+/, $phrase));
138 while($text_copy =~ s/$reg//i){
139 $kea_phrase_counts[$i]++; #count the number of kea phrases
140 }
141 $text_copy = $text;
142 }
143
144 #write data to keyphrase_document index
145 &keyphrase_document($ID, $keaS);
146
147 #write data to document_keyphrase index
148 $kea_counts = join(", ", @kea_phrase_counts);
149 &document_keyphrase($ID, $keaS, $kea_counts);
150
151 } else {
152 print STDERR "No kea data was found in file $filepath\n";
153 }
154
155}
156
157
158#returns a regular expression designed to
159#search for phrases in text
160#eg 'agris caris'
161# agris followed by 0 or 1 non-whitespace characters OR
162# followed by one or more whitespace
163# caris followed by 0 or 1 non-whitespace characters
164#modified from original by Stephen Lundy
165
166sub kea_reg {
167 $regexp = "";
168
169 $l = @_;
170
171 if ($l > 0) {
172 $s = shift;
173 #$regexp = "$s(\\S?)";
174 $regexp = "$s(\\s+|\\S?)";
175
176 if ($l-1 > 0) {
177 foreach $s (@_) {
178 $regexp .= "$s(\\s+|\\S?)";
179 }
180 }
181 }
182
183 return $regexp;
184}
185
186
187#This function is passed as arguments a list of kea phrases and/or stems. Its purpose is to
188#check in the keyphrase index file for each phrase and determine whether or not an entry has
189#been made for that phrase and an index number assigned to it. If there has not been an entry
190#made then an index number is assigned to the phrase and it is written to the file. This
191#function is called by document_keyphrase and keyphrase_document. Each line in the file has
192#this form:
193#-phrase index number:phrase
194#This function then returns a table of pairs of the phrases that were sent as arguments to it
195#{phrase => phrase index number}.
196
197sub keyphrase_index_search {
198
199 my $phrases = shift(@_);
200 my @phrases = split(", ", $phrases);
201 my %table;
202 my $index = 1;
203 my $create_new_index = 0;
204
205 print STDERR "searching keyphrase index...\n";
206
207 #initilise table of phrases and index numbers
208 foreach $phrase (@phrases){
209 $table{"$phrase"} = "0";
210 }
211
212 #open keyphrase index for appending data and for reading
213 open(INDEX_OUT, ">>$gsdlhome/bin/script/indexes/keyphrase_index.txt");
214 open(INDEX_IN, "$gsdlhome/bin/script/indexes/keyphrase_index.txt")
215 or $create_new_index = 1;
216
217 if($create_new_index == 0){
218 #if the index already exists read in the phrases
219 while(<INDEX_IN>){
220 chomp;
221 foreach $phrase (@phrases){
222 if(/(\d+):$phrase/){
223 $index = $1;
224 $table{"$phrase"} = "$index";
225 }
226 }
227 $index++; #new starting index (one + the last index)
228 }
229
230 close(INDEX_IN);
231
232 }
233
234 #add new phrases to the phrase index
235 foreach $phrase (keys %table){
236 if($table{"$phrase"} eq "0"){
237 print INDEX_OUT "$index:$phrase\n";
238 $table{"$phrase"} = "$index";
239 $index++;
240 }
241 }
242
243 close(INDEX_OUT);
244 return %table;
245}
246
247#This function is passed as arguments file hash ID and a list of kea phrases
248#that exist for that particular file. Its purpose is to write to the keyphrase_document
249#index a line for the document it has been sent:
250#-phrase index number:number of documents it appears in|ID
251sub keyphrase_document{
252
253 my ($ID, $kea) = @_;
254 my $text = "";
255 my @textlist;
256 my $create_new_index = 0;
257
258 print STDERR "writing to keyphrase_document.txt...\n";
259
260 #get table of phrases and phrase indexes
261 my %table = keyphrase_index_search($kea);
262
263
264 #open index for reading
265 open(INDEX_IN, "$gsdlhome/bin/script/indexes/keyphrase_document.txt")
266 or $create_new_index = 1;
267
268 #read in document if file exists
269 if($create_new_index == 0){
270
271 while(<INDEX_IN>){
272 $text .= $_;
273 }
274
275 close(INDEX_IN);
276
277 #split text into lines
278 @textlist = split(/\n/, $text);
279
280 }
281
282 #open index for output
283 open(INDEX_OUT, ">$gsdlhome/bin/script/indexes/keyphrase_document.txt");
284
285 if($create_new_index == 0){ #amend existing index
286
287 foreach $line (@textlist){
288 foreach $phrase (keys %table){
289 if($line =~ /(\d+):(\d+)(.*)/){ #all lines of this form
290 $index = $1;
291 if($table{"$phrase"} eq "$index") { #if phrase exists in index
292 $ids = $3; #get all doc IDs for that keyphrase
293 if($ids !~ /$ID/){ #if doc ID not already included
294 $num_docs = $2;
295 $num_docs++; #increment number of docs
296 $line = "$index:$num_docs$3|$ID"; #line to append to index
297 $table{"$phrase"} = "0";
298 }
299 }
300 }
301 }
302 print INDEX_OUT "$line\n";
303 }
304 }
305
306 #add new phrases to the index
307 foreach $phrase (keys %table){ #write 'phrase index:1:file ID
308 if($table{"$phrase"} ne "0"){
309 my $line = "$table{$phrase}:1:$ID";
310 print INDEX_OUT "$line\n";
311 }
312 }
313
314 close(INDEX_OUT);
315
316}
317
318#This function is passed as arguments file hash ID and a list of kea phrases and/or stems
319#that exist for that particular file and a list of the number of times each kea and/or stem
320#phrase appear in that document. Its purpose is to write to the document_keyphrase
321#index a line for the document it has been sent:
322#-file ID:number of phrases and/or stems appear in the document
323# |pairs of 'phrase index,number of times the phrase appears in the document'
324sub document_keyphrase {
325
326 my ($ID, $keaS, $kea_c) = @_;
327 my $text = "";
328 my @textlist;
329 my %phrases;
330 my $create_new_index = 0;
331
332 print STDERR "writing to document_keyphrase.txt...\n";
333
334 #split phrase counts into arrays
335 my @kea_counts = split(", ", $kea_c);
336
337 #get table of phrases and phrase indexes
338 my %table = keyphrase_index_search($keaS);
339
340 #split phrases into arrays
341 my @kea = split(", ", $keaS);
342
343 #build new phrases dictionary
344 for($i=0; $i<=$#kea; $i++){
345 my $phrase = $table{"$kea[$i]"};
346 $phrases{"$phrase"} = "$kea_counts[$i]";
347 }
348
349 my @num = keys %phrases;
350 my $phrasenum = $#num + 1; #number of phrases in doc
351
352 #open index for reading
353 open(INDEX_IN, "$gsdlhome/bin/script/indexes/document_keyphrase.txt")
354 or $create_new_index = 1;
355
356
357 if($create_new_index == 0){ #index doesn't need to be created
358
359 while(<INDEX_IN>){
360 $text .= $_;
361 }
362
363 close(INDEX_IN);
364
365 #split text into lines
366 @textlist = split(/\n/, $text);
367
368 }
369
370
371 #must write this line to the file
372 #'document ID:num of phrases|phrase index, number of times phrases appears
373 my $newline = "$ID:$phrasenum";
374 foreach $phrase (keys %phrases){
375 $newline .= "|$phrase,$phrases{$phrase}";
376 }
377
378 #open index for output
379 open(INDEX_OUT, ">$gsdlhome/bin/script/indexes/document_keyphrase.txt");
380
381 if($create_new_index == 1){ #create a new index
382
383 print INDEX_OUT "$newline\n";
384
385 } else {
386
387 #if ID is already in the file write line overtop incase
388 #someone has modified the file. Otherwise add the line
389 #to the end of the file
390 my $found = 0;
391
392 foreach $line (@textlist){
393 if($line =~ /([^:]+):(.*)/){ #all lines should follow this pattern
394 $id = $1;
395 if($ID eq $id){ #id is already in the file
396 print INDEX_OUT "$newline\n"; #print line overtop
397 $found = 1;
398 } else {
399 print INDEX_OUT "$line\n"; #print old line out
400 }
401 }
402 }
403
404 print INDEX_OUT "$newline\n" if ($found == 0); #append new line to end of file
405
406 }
407
408 close(INDEX_OUT);
409
410}
411
Note: See TracBrowser for help on using the repository browser.