1 | #! /usr/bin/perl
|
---|
2 |
|
---|
3 | #usage: perl buildkpiS.pl [-R] [collection] [collection] etc
|
---|
4 | #
|
---|
5 | #-r or -R will remove previous index files so that you may build new ones
|
---|
6 | #
|
---|
7 | #The program performs the following tasks:
|
---|
8 | #-gathers the specified collections on the command line OR
|
---|
9 | #-gathers the directories of all the collections in the collect directory, this is all
|
---|
10 | # the directories apart from CVS, modelcol, . and .. which are not collections.
|
---|
11 | #-It then retrieves the archives.inf file from the archives directory of each collection
|
---|
12 | # to obtain the unique file ID and filepath of every document in the collection
|
---|
13 | #-Then parse through each doc.gml stored in filepath to gather information
|
---|
14 | #-From each file collect stems of kea keyphrases.
|
---|
15 | #-Determine the number of keyphrase stems for each document
|
---|
16 | #-Search for the keyphrase stems in the numbered phrase index. If a phrase is not there then
|
---|
17 | # the program will write the keyphrase stem to the phrase index .
|
---|
18 | #-Search for the keyphrase stem in the keyphrase to document index. If the stem is there,
|
---|
19 | # it will increment the number of documents that the keyphrase stem appears in and replace that
|
---|
20 | # then append the hash ID to the list of documents in the entry. If the stem is not there
|
---|
21 | # then the program will write a new entry for the stem to the keyphrase to document index.
|
---|
22 | #-Then write document ID, no of phrase, phrase number from index followed by number of times
|
---|
23 | # phrase appears into the document_keyphrase index
|
---|
24 |
|
---|
#Main script: process the options, work out which collections to index and
#feed every document listed in each collection's archives.inf to build_index.

use Getopt::Std;

#GSDLHOME is the root of the Greenstone installation. It stays a package
#global because the index building functions below read it as well.
our $gsdlhome = $ENV{'GSDLHOME'};
die "GSDLHOME is not set in the environment.\n"
    unless defined $gsdlhome && length $gsdlhome;

#process option arguments (-r is accepted as a synonym for -R, as documented)
my %opts;
getopts('Rr', \%opts)
    or die "usage: perl buildkpiS.pl [-R] [collection] [collection] etc\n";

#if option R remove all previous indexes
if ($opts{'R'} || $opts{'r'}) {
    my @index_files = map { "$gsdlhome/bin/script/indexes/$_.txt" }
        qw(keyphrase_index keyphrase_document document_keyphrase);

    print STDERR "\n";
    foreach my $index_file (@index_files) {
        print STDERR "removing $index_file\n";
        next unless -e $index_file;    #nothing to remove on a first build
        unlink($index_file)
            or warn "$index_file could not be removed: $!\n";
    }
}

#collections may be specified in the command line
#otherwise, all collections will be used to build
#the indexes.
my @directories;
if (@ARGV) {
    @directories = @ARGV;
}
else {
    #open collect directory and get a list of all collections
    my $collect_dir = "$gsdlhome/collect";
    opendir(my $dh, $collect_dir)
        or die "$collect_dir could not be opened: $!";

    #skip hidden entries and the CVS and modelcol directories (exact names
    #only, so a collection that merely contains "CVS" is still indexed);
    #sorted so that index numbers are assigned in a repeatable order
    @directories = sort
        grep { !/^(?:\.|CVS$|modelcol$)/ && -d "$collect_dir/$_" }
        readdir($dh);
    closedir($dh);
}

#for each collection specified to build indexes for
foreach my $collection (@directories) {

    #archives.inf contains a list of unique hash ID's of each file and file paths
    my $archives_inf = "$gsdlhome/collect/$collection/archives/archives.inf";
    open(my $info, '<', $archives_inf)
        or die "$archives_inf could not be opened: $!";

    #get each line of text from archives.inf (OID \t filepath)
    my @filelist = <$info>;
    chomp(@filelist);
    close($info);

    #add each document to the indexes
    foreach my $entry (@filelist) {
        next unless $entry =~ /\S/;    #ignore blank lines
        build_index($entry, $collection);
    }
}
72 |
|
---|
#Adds one archived document to the keyphrase indexes.
#
#Arguments:
#  $args       - one line of archives.inf: 'hash ID \t filepath'
#  $collection - name of the collection the document belongs to
#
#The document is read from $gsdlhome/collect/$collection/archives/$filepath.
#The stem phrases are taken from the last stems="..." attribute found in it
#(a ", " separated list). Everything in angled brackets is then removed from
#the text and the number of times each stem phrase occurs is counted, using
#the regular expression built by stem_reg. The hash ID and the stems are
#passed to keyphrase_document, and the hash ID, the stems and the ", "
#separated counts to document_keyphrase.
#
#Dies if the document cannot be opened. Returns nothing useful.

sub build_index {

    my ($args, $collection) = @_;
    my ($ID, $filepath) = split(/\t/, $args);

    #the old two-argument open trimmed whitespace around the file name (for
    #example a stray "\r" from a DOS format archives.inf); keep doing that
    $filepath = "" unless defined $filepath;
    $filepath =~ s/^\s+|\s+$//g;

    print STDERR "\nID: $ID\n";
    print STDERR "filepath: $filepath\n";

    #open file to extract keyphrase information
    my $docpath = "$gsdlhome/collect/$collection/archives/$filepath";
    open(my $doc, '<', $docpath)
        or die "$docpath could not be opened: $!";

    #one pass collects both the stem data and the text of the document
    my $stemsS = "";
    my $text = "";
    while (my $line = <$doc>) {
        chomp $line;
        $stemsS = $1 if $line =~ /stems="([^"]*)"/;
        $text .= $line;
    }
    close($doc);

    print STDERR "stems: $stemsS\n";

    my @stems = split(", ", $stemsS);

    unless (@stems) {
        print STDERR "No stem data was found in file $filepath\n";
        return;
    }

    #chop out all things in angled brackets
    $text =~ s/<[^>]*>//g;

    #using regular expressions generated from stem_reg
    #count how many of each phrase appear in the document
    my @stem_phrase_counts;
    foreach my $stem (@stems) {
        my $count = 0;
        my $reg = stem_reg(split(/\s+/, $stem));

        #an empty pattern would silently reuse the last successful regex
        if ($reg ne "") {
            my $text_copy = $text;    #matches are cut out of a scratch copy
            while ($text_copy =~ s/($reg)//i) {
                last if $1 eq "";     #a zero-width match never makes progress
                $count++;
            }
        }

        push(@stem_phrase_counts, $count);
    }

    #write data to keyphrase_document index
    keyphrase_document($ID, $stemsS);

    #write data to document_keyphrase index
    my $stem_counts = join(", ", @stem_phrase_counts);
    document_keyphrase($ID, $stemsS, $stem_counts);

    return;
}
157 |
|
---|
#Returns, as a string, a regular expression designed to search for a stem
#phrase in text. The arguments are the stemmed words of the phrase, in order.
#eg for ('agri', 'cari') the expression matches
#   agri followed by 0 or more non-whitespace characters
#   followed by one or more whitespace OR 0 or 1 non-whitespace characters
#   cari followed by 0 or more non-whitespace characters
#Each word is matched literally: characters that are special in a regular
#expression are escaped first. With no arguments the empty string is returned.
#modified from original by Stephen Lundy

sub stem_reg {

    #every word may carry any suffix; consecutive words are joined by the
    #separator pattern
    return join("(\\s+|\\S?)", map { quotemeta($_) . "\\S*" } @_);
}
185 |
|
---|
186 |
|
---|
187 |
|
---|
#This function is passed one argument, a ", " separated list of kea phrase
#stems. It looks each phrase up in the keyphrase index file
#($gsdlhome/bin/script/indexes/keyphrase_index.txt), where each line has
#this form:
#-phrase index number:phrase
#A phrase that has no entry yet is given the next free index number (one more
#than the highest number in the file) and is appended to the file, in the
#order the phrases were passed. This function is called by document_keyphrase
#and keyphrase_document.
#It returns a table of pairs for the phrases that were sent to it
#{phrase => phrase index number}.
#Dies if new phrases have to be added and the index file cannot be written.

sub keyphrase_index_search {

    my $phrases = shift(@_);
    my @phrases = split(", ", $phrases);
    my $index_file = "$gsdlhome/bin/script/indexes/keyphrase_index.txt";

    print STDERR "searching keyphrase index...\n";

    #initialise table of phrases and index numbers ("0" means not numbered yet)
    my %table = map { $_ => "0" } @phrases;

    #if the index already exists read in the phrases
    my $next_index = 1;
    if (open(my $in, '<', $index_file)) {
        while (my $line = <$in>) {
            chomp $line;
            next unless $line =~ /^(\d+):(.*)$/;
            my ($number, $phrase) = ($1, $2);

            #whole phrase comparison: "cat" must not pick up "category"
            $table{$phrase} = $number if exists $table{$phrase};

            #new starting index (one + the highest index seen)
            $next_index = $number + 1 if $number >= $next_index;
        }
        close($in);
    }

    #add new phrases to the phrase index
    my @new_phrases = grep { $table{$_} eq "0" } @phrases;
    if (@new_phrases) {
        open(my $out, '>>', $index_file)
            or die "$index_file could not be opened for appending: $!";

        foreach my $phrase (@new_phrases) {
            next unless $table{$phrase} eq "0";    #phrase was passed twice
            print $out "$next_index:$phrase\n";
            $table{$phrase} = "$next_index";
            $next_index++;
        }

        close($out)
            or die "$index_file could not be written: $!";
    }

    return %table;
}
247 |
|
---|
#This function is passed as arguments a file hash ID and the ", " separated
#list of kea phrase stems that exist for that particular file. Its purpose is
#to record the document against each of those phrases in the
#keyphrase_document index
#($gsdlhome/bin/script/indexes/keyphrase_document.txt). Each line of the
#index has this form:
#-phrase index number:number of documents it appears in:ID|ID|...
#For a phrase that already has a line, the document count is incremented and
#the ID appended, unless the ID is already listed, in which case the line is
#left alone. For a phrase without a line, 'phrase index number:1:ID' is
#added at the end. The whole index is read and written back on every call.
#Dies if the index cannot be written.
sub keyphrase_document{

    my ($ID, $stems) = @_;
    my $index_file = "$gsdlhome/bin/script/indexes/keyphrase_document.txt";

    print STDERR "writing to keyphrase_document.txt...\n";

    #get table of phrases and phrase indexes
    my %table = keyphrase_index_search($stems);

    #phrase index numbers that have not been dealt with yet
    my %pending = map { $_ => 1 } values %table;

    #read in the index if the file exists
    my @textlist;
    if (open(my $in, '<', $index_file)) {
        @textlist = <$in>;
        chomp(@textlist);
        close($in);
    }

    #open index for output
    open(my $out, '>', $index_file)
        or die "$index_file could not be opened for writing: $!";

    #amend existing index
    foreach my $line (@textlist) {
        if ($line =~ /^(\d+):(\d+)(.*)$/) {    #all lines of this form
            my ($index, $num_docs, $ids) = ($1, $2, $3);

            #the phrase has a line already, so it must not be added again
            #below, whether or not this document is new to it
            if (delete $pending{$index}) {
                my @doc_ids = grep { length } split(/[:|]/, $ids);

                #whole ID comparison, so one ID being a prefix of another
                #is not mistaken for the document already being listed
                unless (grep { $_ eq $ID } @doc_ids) {
                    $num_docs++;                            #increment number of docs
                    $line = "$index:$num_docs$ids|$ID";     #append the doc ID
                }
            }
        }
        print $out "$line\n";
    }

    #add new phrases to the index: 'phrase index:1:file ID'
    foreach my $index (sort { $a <=> $b } keys %pending) {
        print $out "$index:1:$ID\n";
    }

    close($out)
        or die "$index_file could not be written: $!";

    return;
}
318 |
|
---|
#This function is passed as arguments a file hash ID, the ", " separated list
#of kea phrase stems that exist for that particular file and a ", " separated
#list of the number of times each stem phrase appears in that document (in
#the same order as the stems). Its purpose is to write to the
#document_keyphrase index
#($gsdlhome/bin/script/indexes/document_keyphrase.txt) a line for the
#document it has been sent:
#-file ID:number of phrases in the document
# |pairs of 'phrase index,number of times the phrase appears in the document'
#The pairs are written in ascending phrase index order. A line that is
#already in the index for this ID is replaced, otherwise the new line is
#added at the end. Lines that do not look like 'ID:...' are dropped. The
#whole index is read and written back on every call.
#Dies if the index cannot be written.
sub document_keyphrase {

    my ($ID, $stemsS, $stem_c) = @_;
    my $index_file = "$gsdlhome/bin/script/indexes/document_keyphrase.txt";

    print STDERR "writing to document_keyphrase.txt...\n";

    #split phrase counts into arrays
    my @stem_counts = split(", ", $stem_c);

    #get table of phrases and phrase indexes
    my %table = keyphrase_index_search($stemsS);

    #split phrases into arrays
    my @stems = split(", ", $stemsS);

    #build new phrases dictionary {phrase index number => count}
    my %phrases;
    foreach my $i (0 .. $#stems) {
        my $phrase = $table{ $stems[$i] };
        $phrases{"$phrase"} = "$stem_counts[$i]";
    }
    my $phrasenum = scalar(keys %phrases);    #number of phrases in doc

    #must write this line to the file
    #'document ID:num of phrases|phrase index, number of times phrases appears
    my $newline = "$ID:$phrasenum";
    foreach my $phrase (sort { $a <=> $b } keys %phrases) {
        $newline .= "|$phrase,$phrases{$phrase}";
    }

    #read in the index if the file exists
    my @textlist;
    if (open(my $in, '<', $index_file)) {
        @textlist = <$in>;
        chomp(@textlist);
        close($in);
    }

    #open index for output
    open(my $out, '>', $index_file)
        or die "$index_file could not be opened for writing: $!";

    #if ID is already in the file write line overtop incase
    #someone has modified the file. Otherwise add the line
    #to the end of the file
    my $found = 0;
    foreach my $line (@textlist) {
        next unless $line =~ /([^:]+):/;    #all lines should follow this pattern
        if ($1 eq $ID) {                    #id is already in the file
            print $out "$newline\n" unless $found;    #only one line per ID
            $found = 1;
        }
        else {
            print $out "$line\n";           #print old line out
        }
    }

    print $out "$newline\n" unless $found;  #append new line to end of file

    close($out)
        or die "$index_file could not be written: $!";

    return;
}
411 |
|
---|
412 |
|
---|
413 |
|
---|
414 |
|
---|
415 |
|
---|
416 |
|
---|
417 |
|
---|
418 |
|
---|
419 |
|
---|
420 |
|
---|