1 | #! /user/bin/perl
|
---|
2 |
|
---|
3 | #usage: perl buildkpiK.pl [-R] [collection] [collection] etc
|
---|
4 | #
|
---|
5 | #-r or -R will remove previous index files so that you may build new ones
|
---|
6 | #
|
---|
7 | #The program performs the following tasks:
|
---|
8 | #-gathers the specified collections on the command line OR
|
---|
9 | #-gathers the directories of all the collections in the collect directory, this is all
|
---|
10 | # the directories apart from CVS, modelcol, . and .. which are not collections.
|
---|
11 | #-It then retrieves the archive.inf file from the archive directory of each collection
|
---|
12 | # to obtain the unique file ID and filepath of every document in the collection
|
---|
13 | #-Then parse through each doc.gml stored in filepath to gather information
|
---|
14 | #-From each file collect kea phrases.
|
---|
15 | #-Determine the number of kea phrases for each document
|
---|
16 | #-Search for the kea phrases in the numbered phrase index. If a phrase is not there then
|
---|
17 | # the program will write the kea phrase to the phrase index .
|
---|
18 | #-Search for the kea phrases in the keyphrase to document index. If the phrase is there,
|
---|
19 | # it will increment the number of documents that the keyphrase appears in and replace that
|
---|
20 | # then append the hash ID to the list of documents in the entry. If the phrase is not there
|
---|
21 | # then the program will write the kea phrase to the phrase index.
|
---|
22 | #-Then write document ID, no of phrase, phrase number from index followed by number of times
|
---|
23 | # phrase appears into the document_keyphrase index
|
---|
24 |
|
---|
25 | $gsdlhome = $ENV{'GSDLHOME'};
|
---|
26 | $collection;
|
---|
27 |
|
---|
28 | require "getopts.pl";
|
---|
29 | &Getopts('R'); #process option arguments
|
---|
30 |
|
---|
31 | #if option R remove all previous indexes
|
---|
32 | if($opt_R == 1){ #remove indexes
|
---|
33 | print STDERR "\nremoving $gsdlhome/bin/script/indexes/keyphrase_index.txt\n";
|
---|
34 | print STDERR "removing $gsdlhome/bin/script/indexes/keyphrase_document.txt\n";
|
---|
35 | print STDERR "removing $gsdlhome/bin/script/indexes/document_keyphrase.txt\n";
|
---|
36 | system("rm $gsdlhome/bin/script/indexes/keyphrase_document.txt");
|
---|
37 | system("rm $gsdlhome/bin/script/indexes/document_keyphrase.txt");
|
---|
38 | system("rm $gsdlhome/bin/script/indexes/keyphrase_index.txt");
|
---|
39 | }
|
---|
40 |
|
---|
41 | #collections may be specified in the command line
|
---|
42 | #otherwise, all collections will be used to build
|
---|
43 | #the indexes.
|
---|
44 | if(@ARGV){
|
---|
45 |
|
---|
46 | @directories = @ARGV;
|
---|
47 |
|
---|
48 | } else { #open collect directory and get a list of all collections
|
---|
49 | opendir(DIR, "$gsdlhome/collect");
|
---|
50 | @directories = grep(!/(^\.|(CVS)|(modelcol))/, readdir(DIR));
|
---|
51 | closedir(DIR);
|
---|
52 | }
|
---|
53 |
|
---|
54 | #for each collection specified to build indexes for
|
---|
55 | foreach $collection (@directories){
|
---|
56 |
|
---|
57 | my @filelist;
|
---|
58 |
|
---|
59 | #archives.inf contains a list of unique hash ID's of each file and file paths
|
---|
60 | open(INFO, "$gsdlhome/collect/$collection/archives/archives.inf")
|
---|
61 | or die "$gsdlhome/collect/$collection/archives/archives.inf could not be opened.";
|
---|
62 |
|
---|
63 | while(<INFO>){ #get each line of text from archives.inf (OID \t filepath)
|
---|
64 | chomp;
|
---|
65 | push(@filelist, $_);
|
---|
66 | }
|
---|
67 |
|
---|
68 | foreach $file (@filelist){ #add each document to the indexes
|
---|
69 | build_index($file, $collection);
|
---|
70 | }
|
---|
71 | }
|
---|
72 |
|
---|
73 | #This function opens the file in the filepath sent as an argument. From this it obtains
|
---|
74 | #the kea data, and then searches for these phrases in the file, counting and storing
|
---|
75 | #how many times each phrase appears. The data is then sent to function keyphrase_document
|
---|
76 | #with arguments hash ID, kea phrases to build the keyphrase_document index.
|
---|
77 | #The function which builds the document_keyphrase index is then passed the hash ID, the kea
|
---|
78 | #phrases and the array/s which hold the number of times each phrase
|
---|
79 | #appears in the document so that the data it has collected can be written to document_ keyphrase
|
---|
80 | #index.
|
---|
81 |
|
---|
82 | sub build_index {
|
---|
83 |
|
---|
84 | my $args = shift(@_);
|
---|
85 | my $collection = shift(@_);
|
---|
86 | my ($ID, $filepath) = split(/\t/, $args);
|
---|
87 | my $keaS = "";
|
---|
88 | my @kea_phrase_counts = 0;
|
---|
89 | my $text = "";
|
---|
90 |
|
---|
91 | print STDERR "\nID: $ID\n";
|
---|
92 | print STDERR "filepath: $filepath\n";
|
---|
93 |
|
---|
94 | #open file to extract keyphrase information
|
---|
95 | open(FILE, "$gsdlhome/collect/$collection/archives/$filepath")
|
---|
96 | or die "$gsdlhome/collect/$collection/archives/$filepath could not be opened.";
|
---|
97 |
|
---|
98 | #patterns to search for so that we can extract the kea information
|
---|
99 | my $kea_search = ".* kea=\"([^\"]*)\"";
|
---|
100 |
|
---|
101 | while(<FILE>){ #get kea and stem data and store
|
---|
102 | chomp;
|
---|
103 | $keaS = $1 if (/$kea_search/);
|
---|
104 | }
|
---|
105 |
|
---|
106 | close(FILE);
|
---|
107 |
|
---|
108 | print STDERR "Kea: $keaS\n";
|
---|
109 |
|
---|
110 | my @kea = split(", ", $keaS);
|
---|
111 |
|
---|
112 | if(@kea){ #if the data exists
|
---|
113 |
|
---|
114 | #open the filepath to the current document
|
---|
115 | open(FILE, "$gsdlhome/collect/$collection/archives/$filepath")
|
---|
116 | or die "$gsdlhome/collect/$collection/archives/$filepath could not be opened.";
|
---|
117 |
|
---|
118 | while(<FILE>){ #get the text
|
---|
119 | chomp;
|
---|
120 | $text .= $_;
|
---|
121 | }
|
---|
122 |
|
---|
123 | #chop out all things in angled brackets
|
---|
124 | $text =~ s/(<[^>]*>)//g;
|
---|
125 |
|
---|
126 | #initilise counts
|
---|
127 | for($i=0; $i<=$#kea; $i++){
|
---|
128 | $kea_phrase_counts[$i] = 0;
|
---|
129 | }
|
---|
130 |
|
---|
131 |
|
---|
132 | #using regular expressions generated from kea-reg
|
---|
133 | #count how many of each phrase appear in the document
|
---|
134 | $text_copy = $text;
|
---|
135 | for($i=0; $i<=$#kea; $i++){ #search for text with kea phrases
|
---|
136 | my $phrase = $kea[$i];
|
---|
137 | $reg = &kea_reg(split(/\s+/, $phrase));
|
---|
138 | while($text_copy =~ s/$reg//i){
|
---|
139 | $kea_phrase_counts[$i]++; #count the number of kea phrases
|
---|
140 | }
|
---|
141 | $text_copy = $text;
|
---|
142 | }
|
---|
143 |
|
---|
144 | #write data to keyphrase_document index
|
---|
145 | &keyphrase_document($ID, $keaS);
|
---|
146 |
|
---|
147 | #write data to document_keyphrase index
|
---|
148 | $kea_counts = join(", ", @kea_phrase_counts);
|
---|
149 | &document_keyphrase($ID, $keaS, $kea_counts);
|
---|
150 |
|
---|
151 | } else {
|
---|
152 | print STDERR "No kea data was found in file $filepath\n";
|
---|
153 | }
|
---|
154 |
|
---|
155 | }
|
---|
156 |
|
---|
157 |
|
---|
158 | #returns a regular expression designed to
|
---|
159 | #search for phrases in text
|
---|
160 | #eg 'agris caris'
|
---|
161 | # agris followed by 0 or 1 non-whitespace characters OR
|
---|
162 | # followed by one or more whitespace
|
---|
163 | # caris followed by 0 or 1 non-whitespace characters
|
---|
164 | #modified from original by Stephen Lundy
|
---|
165 |
|
---|
166 | sub kea_reg {
|
---|
167 | $regexp = "";
|
---|
168 |
|
---|
169 | $l = @_;
|
---|
170 |
|
---|
171 | if ($l > 0) {
|
---|
172 | $s = shift;
|
---|
173 | #$regexp = "$s(\\S?)";
|
---|
174 | $regexp = "$s(\\s+|\\S?)";
|
---|
175 |
|
---|
176 | if ($l-1 > 0) {
|
---|
177 | foreach $s (@_) {
|
---|
178 | $regexp .= "$s(\\s+|\\S?)";
|
---|
179 | }
|
---|
180 | }
|
---|
181 | }
|
---|
182 |
|
---|
183 | return $regexp;
|
---|
184 | }
|
---|
185 |
|
---|
186 |
|
---|
187 | #This function is passed as arguments a list of kea phrases and/or stems. Its purpose is to
|
---|
188 | #check in the keyphrase index file for each phrase and determine whether or not an entry has
|
---|
189 | #been made for that phrase and an index number assigned to it. If there has not been an entry
|
---|
190 | #made then an index number is assigned to the phrase and it is written to the file. This
|
---|
191 | #function is called by document_keyphrase and keyphrase_document. Each line in the file has
|
---|
192 | #this form:
|
---|
193 | #-phrase index number:phrase
|
---|
194 | #This function then returns a table of pairs of the phrases that were sent as arguments to it
|
---|
195 | #{phrase => phrase index number}.
|
---|
196 |
|
---|
197 | sub keyphrase_index_search {
|
---|
198 |
|
---|
199 | my $phrases = shift(@_);
|
---|
200 | my @phrases = split(", ", $phrases);
|
---|
201 | my %table;
|
---|
202 | my $index = 1;
|
---|
203 | my $create_new_index = 0;
|
---|
204 |
|
---|
205 | print STDERR "searching keyphrase index...\n";
|
---|
206 |
|
---|
207 | #initilise table of phrases and index numbers
|
---|
208 | foreach $phrase (@phrases){
|
---|
209 | $table{"$phrase"} = "0";
|
---|
210 | }
|
---|
211 |
|
---|
212 | #open keyphrase index for appending data and for reading
|
---|
213 | open(INDEX_OUT, ">>$gsdlhome/bin/script/indexes/keyphrase_index.txt");
|
---|
214 | open(INDEX_IN, "$gsdlhome/bin/script/indexes/keyphrase_index.txt")
|
---|
215 | or $create_new_index = 1;
|
---|
216 |
|
---|
217 | if($create_new_index == 0){
|
---|
218 | #if the index already exists read in the phrases
|
---|
219 | while(<INDEX_IN>){
|
---|
220 | chomp;
|
---|
221 | foreach $phrase (@phrases){
|
---|
222 | if(/(\d+):$phrase/){
|
---|
223 | $index = $1;
|
---|
224 | $table{"$phrase"} = "$index";
|
---|
225 | }
|
---|
226 | }
|
---|
227 | $index++; #new starting index (one + the last index)
|
---|
228 | }
|
---|
229 |
|
---|
230 | close(INDEX_IN);
|
---|
231 |
|
---|
232 | }
|
---|
233 |
|
---|
234 | #add new phrases to the phrase index
|
---|
235 | foreach $phrase (keys %table){
|
---|
236 | if($table{"$phrase"} eq "0"){
|
---|
237 | print INDEX_OUT "$index:$phrase\n";
|
---|
238 | $table{"$phrase"} = "$index";
|
---|
239 | $index++;
|
---|
240 | }
|
---|
241 | }
|
---|
242 |
|
---|
243 | close(INDEX_OUT);
|
---|
244 | return %table;
|
---|
245 | }
|
---|
246 |
|
---|
247 | #This function is passed as arguments file hash ID and a list of kea phrases
|
---|
248 | #that exist for that particular file. Its purpose is to write to the keyphrase_document
|
---|
249 | #index a line for the document it has been sent:
|
---|
250 | #-phrase index number:number of documents it appears in|ID
|
---|
251 | sub keyphrase_document{
|
---|
252 |
|
---|
253 | my ($ID, $kea) = @_;
|
---|
254 | my $text = "";
|
---|
255 | my @textlist;
|
---|
256 | my $create_new_index = 0;
|
---|
257 |
|
---|
258 | print STDERR "writing to keyphrase_document.txt...\n";
|
---|
259 |
|
---|
260 | #get table of phrases and phrase indexes
|
---|
261 | my %table = keyphrase_index_search($kea);
|
---|
262 |
|
---|
263 |
|
---|
264 | #open index for reading
|
---|
265 | open(INDEX_IN, "$gsdlhome/bin/script/indexes/keyphrase_document.txt")
|
---|
266 | or $create_new_index = 1;
|
---|
267 |
|
---|
268 | #read in document if file exists
|
---|
269 | if($create_new_index == 0){
|
---|
270 |
|
---|
271 | while(<INDEX_IN>){
|
---|
272 | $text .= $_;
|
---|
273 | }
|
---|
274 |
|
---|
275 | close(INDEX_IN);
|
---|
276 |
|
---|
277 | #split text into lines
|
---|
278 | @textlist = split(/\n/, $text);
|
---|
279 |
|
---|
280 | }
|
---|
281 |
|
---|
282 | #open index for output
|
---|
283 | open(INDEX_OUT, ">$gsdlhome/bin/script/indexes/keyphrase_document.txt");
|
---|
284 |
|
---|
285 | if($create_new_index == 0){ #amend existing index
|
---|
286 |
|
---|
287 | foreach $line (@textlist){
|
---|
288 | foreach $phrase (keys %table){
|
---|
289 | if($line =~ /(\d+):(\d+)(.*)/){ #all lines of this form
|
---|
290 | $index = $1;
|
---|
291 | if($table{"$phrase"} eq "$index") { #if phrase exists in index
|
---|
292 | $ids = $3; #get all doc IDs for that keyphrase
|
---|
293 | if($ids !~ /$ID/){ #if doc ID not already included
|
---|
294 | $num_docs = $2;
|
---|
295 | $num_docs++; #increment number of docs
|
---|
296 | $line = "$index:$num_docs$3|$ID"; #line to append to index
|
---|
297 | $table{"$phrase"} = "0";
|
---|
298 | }
|
---|
299 | }
|
---|
300 | }
|
---|
301 | }
|
---|
302 | print INDEX_OUT "$line\n";
|
---|
303 | }
|
---|
304 | }
|
---|
305 |
|
---|
306 | #add new phrases to the index
|
---|
307 | foreach $phrase (keys %table){ #write 'phrase index:1:file ID
|
---|
308 | if($table{"$phrase"} ne "0"){
|
---|
309 | my $line = "$table{$phrase}:1:$ID";
|
---|
310 | print INDEX_OUT "$line\n";
|
---|
311 | }
|
---|
312 | }
|
---|
313 |
|
---|
314 | close(INDEX_OUT);
|
---|
315 |
|
---|
316 | }
|
---|
317 |
|
---|
318 | #This function is passed as arguments file hash ID and a list of kea phrases and/or stems
|
---|
319 | #that exist for that particular file and a list of the number of times each kea and/or stem
|
---|
320 | #phrase appear in that document. Its purpose is to write to the document_keyphrase
|
---|
321 | #index a line for the document it has been sent:
|
---|
322 | #-file ID:number of phrases and/or stems appear in the document
|
---|
323 | # |pairs of 'phrase index,number of times the phrase appears in the document'
|
---|
324 | sub document_keyphrase {
|
---|
325 |
|
---|
326 | my ($ID, $keaS, $kea_c) = @_;
|
---|
327 | my $text = "";
|
---|
328 | my @textlist;
|
---|
329 | my %phrases;
|
---|
330 | my $create_new_index = 0;
|
---|
331 |
|
---|
332 | print STDERR "writing to document_keyphrase.txt...\n";
|
---|
333 |
|
---|
334 | #split phrase counts into arrays
|
---|
335 | my @kea_counts = split(", ", $kea_c);
|
---|
336 |
|
---|
337 | #get table of phrases and phrase indexes
|
---|
338 | my %table = keyphrase_index_search($keaS);
|
---|
339 |
|
---|
340 | #split phrases into arrays
|
---|
341 | my @kea = split(", ", $keaS);
|
---|
342 |
|
---|
343 | #build new phrases dictionary
|
---|
344 | for($i=0; $i<=$#kea; $i++){
|
---|
345 | my $phrase = $table{"$kea[$i]"};
|
---|
346 | $phrases{"$phrase"} = "$kea_counts[$i]";
|
---|
347 | }
|
---|
348 |
|
---|
349 | my @num = keys %phrases;
|
---|
350 | my $phrasenum = $#num + 1; #number of phrases in doc
|
---|
351 |
|
---|
352 | #open index for reading
|
---|
353 | open(INDEX_IN, "$gsdlhome/bin/script/indexes/document_keyphrase.txt")
|
---|
354 | or $create_new_index = 1;
|
---|
355 |
|
---|
356 |
|
---|
357 | if($create_new_index == 0){ #index doesn't need to be created
|
---|
358 |
|
---|
359 | while(<INDEX_IN>){
|
---|
360 | $text .= $_;
|
---|
361 | }
|
---|
362 |
|
---|
363 | close(INDEX_IN);
|
---|
364 |
|
---|
365 | #split text into lines
|
---|
366 | @textlist = split(/\n/, $text);
|
---|
367 |
|
---|
368 | }
|
---|
369 |
|
---|
370 |
|
---|
371 | #must write this line to the file
|
---|
372 | #'document ID:num of phrases|phrase index, number of times phrases appears
|
---|
373 | my $newline = "$ID:$phrasenum";
|
---|
374 | foreach $phrase (keys %phrases){
|
---|
375 | $newline .= "|$phrase,$phrases{$phrase}";
|
---|
376 | }
|
---|
377 |
|
---|
378 | #open index for output
|
---|
379 | open(INDEX_OUT, ">$gsdlhome/bin/script/indexes/document_keyphrase.txt");
|
---|
380 |
|
---|
381 | if($create_new_index == 1){ #create a new index
|
---|
382 |
|
---|
383 | print INDEX_OUT "$newline\n";
|
---|
384 |
|
---|
385 | } else {
|
---|
386 |
|
---|
387 | #if ID is already in the file write line overtop incase
|
---|
388 | #someone has modified the file. Otherwise add the line
|
---|
389 | #to the end of the file
|
---|
390 | my $found = 0;
|
---|
391 |
|
---|
392 | foreach $line (@textlist){
|
---|
393 | if($line =~ /([^:]+):(.*)/){ #all lines should follow this pattern
|
---|
394 | $id = $1;
|
---|
395 | if($ID eq $id){ #id is already in the file
|
---|
396 | print INDEX_OUT "$newline\n"; #print line overtop
|
---|
397 | $found = 1;
|
---|
398 | } else {
|
---|
399 | print INDEX_OUT "$line\n"; #print old line out
|
---|
400 | }
|
---|
401 | }
|
---|
402 | }
|
---|
403 |
|
---|
404 | print INDEX_OUT "$newline\n" if ($found == 0); #append new line to end of file
|
---|
405 |
|
---|
406 | }
|
---|
407 |
|
---|
408 | close(INDEX_OUT);
|
---|
409 |
|
---|
410 | }
|
---|
411 |
|
---|