1 | #! /user/bin/perl
|
---|
2 |
|
---|
3 | #usage: perl buildkpi.pl [-R] [collection] [collection] etc
|
---|
4 | #
|
---|
5 | #-r or -R will remove previous index files so that you may build new ones
|
---|
6 | #
|
---|
7 | #The program performs the following tasks:
|
---|
8 | #-gathers the specified collections on the command line OR
|
---|
9 | #-gathers the directories of all the collections in the collect directory, this is all
|
---|
10 | # the directories apart from CVS, modelcol, . and .. which are not collections.
|
---|
11 | #-It then retrieves the archive.inf file from the archive directory of each collection
|
---|
12 | # to obtain the unique file ID and filepath of every document in the collection
|
---|
13 | #-Then parse through each doc.gml stored in filepath to gather information
|
---|
14 | #-From each file collect kea phrases and/or stems
|
---|
15 | #-Determine the number of kea phrases and stems for each document
|
---|
16 | #-Search for the kea phrases in the numbered phrase index. If a phrase is not there then
|
---|
17 | # the program will write the kea phrase to the phrase index .
|
---|
18 | #-Search for the kea phrases in the keyphrase to document index. If the phrase is there,
|
---|
19 | # it will increment the number of documents that the keyphrase appears in and replace that
|
---|
20 | # then append the hash ID to the list of documents in the entry. If the phrase is not there
|
---|
21 | # then the program will write the kea phrase to the phrase index.
|
---|
22 | #-Then write document ID, no of phrase, phrase number from index followed by number of times
|
---|
23 | # phrase appears into the document_keyphrase index
|
---|
24 |
|
---|
25 | $gsdlhome = $ENV{'GSDLHOME'};
|
---|
26 |
|
---|
27 | require "getopts.pl";
|
---|
28 | &Getopts('R'); #process option arguments
|
---|
29 |
|
---|
30 |
|
---|
31 | #collections may be specified in the command line
|
---|
32 | #otherwise, all collections will be used to build
|
---|
33 | #the indexes.
|
---|
34 | if(@ARGV){
|
---|
35 |
|
---|
36 | @directories = @ARGV;
|
---|
37 |
|
---|
38 | } else { #open collect directory and get a list of all collections
|
---|
39 | opendir(DIR, "$gsdlhome/collect");
|
---|
40 | @directories = grep(!/(^\.|(CVS)|(modelcol))/, readdir(DIR));
|
---|
41 | closedir(DIR);
|
---|
42 | }
|
---|
43 |
|
---|
44 |
|
---|
45 | #directory to store indexes in using collection names
|
---|
46 | $dirname = "";
|
---|
47 | foreach $collection (sort @directories){
|
---|
48 | $dirname .= $collection."_";
|
---|
49 | }
|
---|
50 | $dirname .= "indexes";
|
---|
51 | print STDERR "directory name: $dirname\n";
|
---|
52 |
|
---|
53 | #if option R remove all previous indexes
|
---|
54 | if($opt_R == 1){ #remove indexes
|
---|
55 | print STDERR "\nremoving $gsdlhome/bin/script/indexes/$dirname/keyphrase_index.txt\n";
|
---|
56 | print STDERR "removing $gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt\n";
|
---|
57 | print STDERR "removing $gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt\n";
|
---|
58 | system("rm $gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt");
|
---|
59 | system("rm $gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt");
|
---|
60 | system("rm $gsdlhome/bin/script/indexes/$dirname/keyphrase_index.txt");
|
---|
61 | }
|
---|
62 |
|
---|
63 | #create the new directory, display an error message if the directory
|
---|
64 | #already exists.
|
---|
65 | `mkdir --verbose $gsdlhome/bin/script/indexes/$dirname`;
|
---|
66 |
|
---|
67 | #for each collection specified to build indexes for
|
---|
68 | foreach $collection (@directories){
|
---|
69 |
|
---|
70 | print STDERR "\nBUILDING INDEXES FOR COLLECTION $collection\n\n";
|
---|
71 |
|
---|
72 | my @filelist;
|
---|
73 |
|
---|
74 | #archives.inf contains a list of unique hash ID's of each file and file paths
|
---|
75 | open(INFO, "$gsdlhome/collect/$collection/archives/archives.inf")
|
---|
76 | or die "$gsdlhome/collect/$collection/archives/archives.inf could not be opened.";
|
---|
77 |
|
---|
78 | while(<INFO>){ #get each line of text from archives.inf (OID \t filepath)
|
---|
79 | chomp;
|
---|
80 | push(@filelist, $_);
|
---|
81 | }
|
---|
82 |
|
---|
83 | foreach $file (@filelist){ #add each document to the indexes
|
---|
84 | build_index($file, $collection);
|
---|
85 | }
|
---|
86 | }
|
---|
87 |
|
---|
88 | #This function opens the file in the filepath sent as an argument. From this it obtains
|
---|
89 | #the kea and/or stem data, and then searches for these phrases in the file, counting and storing
|
---|
90 | #how many times each phrase appears. The data is then sent to function keyphrase_document
|
---|
91 | #with arguments hash ID, kea phrases and stem phrases to build the keyphrase_document index.
|
---|
92 | #The function which builds the document_keyphrase index is then passed the hash ID, the kea
|
---|
93 | #phrases and/or the stemmed phrases and the array/s which hold the number of times each phrase
|
---|
94 | #appears in the document so that the data it has collected can be written to document_ keyphrase
|
---|
95 | #index.
|
---|
96 |
|
---|
97 | sub build_index {
|
---|
98 |
|
---|
99 | my $args = shift(@_);
|
---|
100 | my $collection = shift(@_);
|
---|
101 | my ($ID, $filepath) = split(/\t/, $args);
|
---|
102 | my $keaS = "";
|
---|
103 | my $stemsS = "";
|
---|
104 | my @kea_phrase_counts = 0;
|
---|
105 | my @stem_phrase_counts = 0;
|
---|
106 | my $text = "";
|
---|
107 |
|
---|
108 | print STDERR "\nID: $ID\n";
|
---|
109 | print STDERR "filepath: $filepath\n";
|
---|
110 |
|
---|
111 | #open file to extract keyphrase information
|
---|
112 | open(FILE, "$gsdlhome/collect/$collection/archives/$filepath")
|
---|
113 | or die "$gsdlhome/collect/$collection/archives/$filepath could not be opened.";
|
---|
114 |
|
---|
115 | #patterns to search for so that we can extract the kea information
|
---|
116 | my $kea_search = ".* kea=\"([^\"]*)\"";
|
---|
117 | my $stem_search = "stems=\"([^\"]*)\"";
|
---|
118 |
|
---|
119 | while(<FILE>){ #get kea and stem data and store
|
---|
120 | chomp;
|
---|
121 | $keaS = $1 if (/$kea_search/);
|
---|
122 | $stemsS = $1 if (/$stem_search/);
|
---|
123 | }
|
---|
124 |
|
---|
125 | close(FILE);
|
---|
126 |
|
---|
127 | print STDERR "Kea: $keaS\n";
|
---|
128 | print STDERR "stems: $stemsS\n";
|
---|
129 |
|
---|
130 | my @kea = split(", ", $keaS);
|
---|
131 | my @stems = split(", ", $stemsS);
|
---|
132 |
|
---|
133 | if(@kea && @stems){ #if the data exists
|
---|
134 |
|
---|
135 | #open the filepath to the current document
|
---|
136 | open(FILE, "$gsdlhome/collect/$collection/archives/$filepath")
|
---|
137 | or die "$gsdlhome/collect/$collection/archives/$filepath could not be opened.";
|
---|
138 |
|
---|
139 | while(<FILE>){ #get the text
|
---|
140 | chomp;
|
---|
141 | $text .= $_;
|
---|
142 | }
|
---|
143 |
|
---|
144 | #chop out all things in angled brackets
|
---|
145 | $text =~ s/(<[^>]*>)//g;
|
---|
146 |
|
---|
147 | #initilise counts
|
---|
148 | for($i=0; $i<=$#kea; $i++){
|
---|
149 | $kea_phrase_counts[$i] = 0;
|
---|
150 | }
|
---|
151 |
|
---|
152 | for($i=0; $i<=$#stems; $i++){
|
---|
153 | $stem_phrase_counts[$i] = 0;
|
---|
154 | }
|
---|
155 |
|
---|
156 | print STDERR "counting number of kea phrases in document...\n";
|
---|
157 |
|
---|
158 | #using regular expressions generated from kea-reg and stem-reg
|
---|
159 | #count how many of each phrase appear in the document
|
---|
160 | $text_copy = $text;
|
---|
161 | for($i=0; $i<=$#kea; $i++){ #search for text with kea phrases
|
---|
162 | my $phrase = $kea[$i];
|
---|
163 | $reg = &kea_reg(split(/\s+/, $phrase));
|
---|
164 | while($text_copy =~ s/$reg//i){
|
---|
165 | $kea_phrase_counts[$i]++; #count the number of kea phrases
|
---|
166 | }
|
---|
167 | $text_copy = $text;
|
---|
168 | }
|
---|
169 |
|
---|
170 | print STDERR "counting number of stemmed phrases in document...\n";
|
---|
171 |
|
---|
172 | $text_copy = $text;
|
---|
173 | for($i=0; $i<=$#stems; $i++){ #search for text with stem phrases
|
---|
174 | my $stem = $stems[$i];
|
---|
175 | $reg = &stem_reg(split(/\s+/, $stem));
|
---|
176 | while($text_copy =~ s/$reg//i){
|
---|
177 | $stem_phrase_counts[$i]++; #count the number of stem phrases
|
---|
178 | }
|
---|
179 | $text_copy = $text;
|
---|
180 | }
|
---|
181 |
|
---|
182 |
|
---|
183 | #write data to keyphrase_document index
|
---|
184 | &keyphrase_document($ID, $keaS, $stemsS);
|
---|
185 |
|
---|
186 | #write data to document_keyphrase index
|
---|
187 | $kea_counts = join(", ", @kea_phrase_counts);
|
---|
188 | $stem_counts = join(", ", @stem_phrase_counts);
|
---|
189 | &document_keyphrase($ID, $keaS, $stemsS, $kea_counts, $stem_counts);
|
---|
190 |
|
---|
191 | } else {
|
---|
192 | print STDERR "No kea data was found in file $filepath\n";
|
---|
193 | }
|
---|
194 |
|
---|
195 | }
|
---|
196 |
|
---|
197 | #returns a regular expression designed to
|
---|
198 | #search for stems in text
|
---|
199 | #eg 'agri cari'
|
---|
200 | # agri followed by 0 or more non-whitespace characters
|
---|
201 | # followed by one or more whitespace OR 0 or 1 non-whitespace characters
|
---|
202 | # cari followed by 0 or more non-whitespace characters
|
---|
203 | #modified from original by Stephen Lundy
|
---|
204 |
|
---|
205 | sub stem_reg {
|
---|
206 |
|
---|
207 | $regexp = "";
|
---|
208 |
|
---|
209 | $l = @_;
|
---|
210 |
|
---|
211 | if ($l > 0) {
|
---|
212 | $s = shift;
|
---|
213 | $regexp = "$s\\S*";
|
---|
214 |
|
---|
215 | if ($l-1 > 0) {
|
---|
216 | foreach $s (@_) {
|
---|
217 | $regexp .= "(\\s+|\\S?)$s\\S*";
|
---|
218 | }
|
---|
219 | }
|
---|
220 | }
|
---|
221 |
|
---|
222 | return $regexp;
|
---|
223 | }
|
---|
224 |
|
---|
225 | #returns a regular expression designed to
|
---|
226 | #search for phrases in text
|
---|
227 | #eg 'agris caris'
|
---|
228 | # agris followed by 0 or 1 non-whitespace characters OR
|
---|
229 | # followed by one or more whitespace
|
---|
230 | # caris followed by 0 or 1 non-whitespace characters
|
---|
231 | #modified from original by Stephen Lundy
|
---|
232 |
|
---|
233 | sub kea_reg {
|
---|
234 | $regexp = "";
|
---|
235 |
|
---|
236 | $l = @_;
|
---|
237 |
|
---|
238 | if ($l > 0) {
|
---|
239 | $s = shift;
|
---|
240 | #$regexp = "$s(\\S?)";
|
---|
241 | $regexp = "$s(\\s+|\\S?)";
|
---|
242 |
|
---|
243 | if ($l-1 > 0) {
|
---|
244 | foreach $s (@_) {
|
---|
245 | $regexp .= "$s(\\s+|\\S?)";
|
---|
246 | }
|
---|
247 | }
|
---|
248 | }
|
---|
249 |
|
---|
250 | return $regexp;
|
---|
251 | }
|
---|
252 |
|
---|
253 |
|
---|
254 | #This function is passed as arguments a list of kea phrases and/or stems. Its purpose is to
|
---|
255 | #check in the keyphrase index file for each phrase and determine whether or not an entry has
|
---|
256 | #been made for that phrase and an index number assigned to it. If there has not been an entry
|
---|
257 | #made then an index number is assigned to the phrase and it is written to the file. This
|
---|
258 | #function is called by document_keyphrase and keyphrase_document. Each line in the file has
|
---|
259 | #this form:
|
---|
260 | #-phrase index number:phrase
|
---|
261 | #This function then returns a table of pairs of the phrases that were sent as arguments to it
|
---|
262 | #{phrase => phrase index number}.
|
---|
263 |
|
---|
264 | sub keyphrase_index_search {
|
---|
265 |
|
---|
266 | my $phrases = shift(@_);
|
---|
267 | my @phrases = split(", ", $phrases);
|
---|
268 | my %table;
|
---|
269 | my $index = 1;
|
---|
270 | my $create_new_index = 0;
|
---|
271 |
|
---|
272 | print STDERR "searching keyphrase index...\n";
|
---|
273 |
|
---|
274 | #initilise table of phrases and index numbers
|
---|
275 | foreach $phrase (@phrases){
|
---|
276 | $table{"$phrase"} = "0";
|
---|
277 | }
|
---|
278 |
|
---|
279 | #open keyphrase index for appending data and for reading
|
---|
280 | open(INDEX_OUT, ">>$gsdlhome/bin/script/indexes/$dirname/keyphrase_index.txt");
|
---|
281 | open(INDEX_IN, "$gsdlhome/bin/script/indexes/$dirname/keyphrase_index.txt")
|
---|
282 | or $create_new_index = 1;
|
---|
283 |
|
---|
284 | if($create_new_index == 0){
|
---|
285 | #if the index already exists read in the phrases
|
---|
286 | while(<INDEX_IN>){
|
---|
287 | chomp;
|
---|
288 | foreach $phrase (@phrases){
|
---|
289 | if(/(\d+):$phrase/){
|
---|
290 | $index = $1;
|
---|
291 | $table{"$phrase"} = "$index";
|
---|
292 | }
|
---|
293 | }
|
---|
294 | $index++; #new starting index (one + the last index)
|
---|
295 | }
|
---|
296 |
|
---|
297 | close(INDEX_IN);
|
---|
298 |
|
---|
299 | }
|
---|
300 |
|
---|
301 | #add new phrases to the phrase index
|
---|
302 | foreach $phrase (keys %table){
|
---|
303 | if($table{"$phrase"} eq "0"){
|
---|
304 | print INDEX_OUT "$index:$phrase\n";
|
---|
305 | $table{"$phrase"} = "$index";
|
---|
306 | $index++;
|
---|
307 | }
|
---|
308 | }
|
---|
309 |
|
---|
310 | close(INDEX_OUT);
|
---|
311 | return %table;
|
---|
312 | }
|
---|
313 |
|
---|
314 | #This function is passed as arguments file hash ID and a list of kea phrases and/or stems
|
---|
315 | #that exist for that particular file. Its purpose is to write to the keyphrase_document
|
---|
316 | #index a line for the document it has been sent:
|
---|
317 | #-phrase index number:number of documents it appears in|ID
|
---|
318 | sub keyphrase_document{
|
---|
319 |
|
---|
320 | my ($ID, $kea, $stems) = @_;
|
---|
321 | my $text = "";
|
---|
322 | my @textlist;
|
---|
323 | my $create_new_index = 0;
|
---|
324 |
|
---|
325 | print STDERR "writing to keyphrase_document.txt...\n";
|
---|
326 |
|
---|
327 | #get table of phrases and phrase indexes
|
---|
328 | my %table = keyphrase_index_search($kea.", ".$stems);
|
---|
329 |
|
---|
330 |
|
---|
331 | #open index for reading
|
---|
332 | open(INDEX_IN, "$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt")
|
---|
333 | or $create_new_index = 1;
|
---|
334 |
|
---|
335 | #read in document if file exists
|
---|
336 | if($create_new_index == 0){
|
---|
337 |
|
---|
338 | while(<INDEX_IN>){
|
---|
339 | $text .= $_;
|
---|
340 | }
|
---|
341 |
|
---|
342 | close(INDEX_IN);
|
---|
343 |
|
---|
344 | #split text into lines
|
---|
345 | @textlist = split(/\n/, $text);
|
---|
346 |
|
---|
347 | }
|
---|
348 |
|
---|
349 | #open index for output
|
---|
350 | open(INDEX_OUT, ">$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt");
|
---|
351 |
|
---|
352 | if($create_new_index == 0){ #amend existing index
|
---|
353 |
|
---|
354 | foreach $line (@textlist){
|
---|
355 | foreach $phrase (keys %table){
|
---|
356 | if($line =~ /(\d+):(\d+)(.*)/){ #all lines of this form
|
---|
357 | $index = $1;
|
---|
358 | if($table{"$phrase"} eq "$index") { #if phrase exists in index
|
---|
359 | $ids = $3; #get all doc IDs for that keyphrase
|
---|
360 | if($ids !~ /$ID/){ #if doc ID not already included
|
---|
361 | $num_docs = $2;
|
---|
362 | $num_docs++; #increment number of docs
|
---|
363 | $line = "$index:$num_docs$3|$ID"; #line to append to index
|
---|
364 | $table{"$phrase"} = "0";
|
---|
365 | }
|
---|
366 | }
|
---|
367 | }
|
---|
368 | }
|
---|
369 | print INDEX_OUT "$line\n";
|
---|
370 | }
|
---|
371 | }
|
---|
372 |
|
---|
373 | #add new phrases to the index
|
---|
374 | foreach $phrase (keys %table){ #write 'phrase index:1:file ID
|
---|
375 | if($table{"$phrase"} ne "0"){
|
---|
376 | my $line = "$table{$phrase}:1:$ID";
|
---|
377 | print INDEX_OUT "$line\n";
|
---|
378 | }
|
---|
379 | }
|
---|
380 |
|
---|
381 | close(INDEX_OUT);
|
---|
382 |
|
---|
383 | }
|
---|
384 |
|
---|
385 | #This function is passed as arguments file hash ID and a list of kea phrases and/or stems
|
---|
386 | #that exist for that particular file and a list of the number of times each kea and/or stem
|
---|
387 | #phrase appear in that document. Its purpose is to write to the document_keyphrase
|
---|
388 | #index a line for the document it has been sent:
|
---|
389 | #-file ID:number of phrases and/or stems appear in the document
|
---|
390 | # |pairs of 'phrase index,number of times the phrase appears in the document'
|
---|
391 | sub document_keyphrase {
|
---|
392 |
|
---|
393 | my ($ID, $keaS, $stemsS, $kea_c, $stem_c) = @_;
|
---|
394 | my $text = "";
|
---|
395 | my @textlist;
|
---|
396 | my %phrases;
|
---|
397 | my $create_new_index = 0;
|
---|
398 |
|
---|
399 | print STDERR "writing to document_keyphrase.txt...\n";
|
---|
400 |
|
---|
401 | #split phrase counts into arrays
|
---|
402 | my @kea_counts = split(", ", $kea_c);
|
---|
403 | my @stem_counts = split(", ", $stem_c);
|
---|
404 |
|
---|
405 | #get table of phrases and phrase indexes
|
---|
406 | my %table = keyphrase_index_search($keaS.", ".$stemsS);
|
---|
407 |
|
---|
408 | #split phrases into arrays
|
---|
409 | my @kea = split(", ", $keaS);
|
---|
410 | my @stems = split(", ", $stemsS);
|
---|
411 |
|
---|
412 | #build new phrases dictionary
|
---|
413 | for($i=0; $i<=$#kea; $i++){
|
---|
414 | my $phrase = $table{"$kea[$i]"};
|
---|
415 | if($kea_counts[$i] > 0){
|
---|
416 | $phrases{"$phrase"} = "$kea_counts[$i]";
|
---|
417 | } else {
|
---|
418 | $phrases{"$phrase"} = 1;
|
---|
419 | }
|
---|
420 | }
|
---|
421 | for($i=0; $i<=$#stems; $i++){
|
---|
422 | my $phrase = $table{"$stems[$i]"};
|
---|
423 | if($stem_counts[$i] > 0){
|
---|
424 | $phrases{"$phrase"} = "$stem_counts[$i]";
|
---|
425 | } else {
|
---|
426 | $phrases{"$phrase"} = 1;
|
---|
427 | }
|
---|
428 | }
|
---|
429 | my @num = keys %phrases;
|
---|
430 | my $phrasenum = $#num + 1; #number of phrases in doc
|
---|
431 |
|
---|
432 | #open index for reading
|
---|
433 | open(INDEX_IN, "$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt")
|
---|
434 | or $create_new_index = 1;
|
---|
435 |
|
---|
436 |
|
---|
437 | if($create_new_index == 0){ #index doesn't need to be created
|
---|
438 |
|
---|
439 | while(<INDEX_IN>){
|
---|
440 | $text .= $_;
|
---|
441 | }
|
---|
442 |
|
---|
443 | close(INDEX_IN);
|
---|
444 |
|
---|
445 | #split text into lines
|
---|
446 | @textlist = split(/\n/, $text);
|
---|
447 |
|
---|
448 | }
|
---|
449 |
|
---|
450 |
|
---|
451 | #must write this line to the file
|
---|
452 | #document ID:num of phrases|phrase index, number of times phrases appears
|
---|
453 | my $newline = "$ID:$phrasenum";
|
---|
454 | foreach $phrase (keys %phrases){
|
---|
455 | $newline .= "|$phrase,$phrases{$phrase}";
|
---|
456 | }
|
---|
457 |
|
---|
458 | #open index for output
|
---|
459 | open(INDEX_OUT, ">$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt");
|
---|
460 |
|
---|
461 | if($create_new_index == 1){ #create a new index
|
---|
462 |
|
---|
463 | print INDEX_OUT "$newline\n";
|
---|
464 |
|
---|
465 | } else {
|
---|
466 |
|
---|
467 | #if ID is already in the file write line overtop incase
|
---|
468 | #someone has modified the file. Otherwise add the line
|
---|
469 | #to the end of the file
|
---|
470 | my $found = 0;
|
---|
471 |
|
---|
472 | foreach $line (@textlist){
|
---|
473 | if($line =~ /([^:]+):(.*)/){ #all lines should follow this pattern
|
---|
474 | $id = $1;
|
---|
475 | if($ID eq $id){ #id is already in the file
|
---|
476 | print INDEX_OUT "$newline\n"; #print line overtop
|
---|
477 | $found = 1;
|
---|
478 | } else {
|
---|
479 | print INDEX_OUT "$line\n"; #print old line out
|
---|
480 | }
|
---|
481 | }
|
---|
482 | }
|
---|
483 |
|
---|
484 | print INDEX_OUT "$newline\n" if ($found == 0); #append new line to end of file
|
---|
485 |
|
---|
486 | }
|
---|
487 |
|
---|
488 | close(INDEX_OUT);
|
---|
489 |
|
---|
490 | }
|
---|
491 |
|
---|