source: trunk/gsdl/bin/script/indexes/buildkpiS.pl@ 1971

Last change on this file since 1971 was 1971, checked in by jmt14, 23 years ago

added files: Core.pm PDF.pm Parse.pm amend_pdf.pl

buildkpi.pl buildkpiS.pl buildkpiK.pl relation.pl

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.0 KB
Line 
#! /usr/bin/perl

#usage: perl buildkpiS.pl [-R] [collection] [collection] etc
#
#-R will remove previous index files so that you may build new ones
#
#The program performs the following tasks:
#-gathers the collections specified on the command line OR
#-gathers the directories of all the collections in the collect directory, that is, all
# the directories apart from CVS, modelcol, . and .., which are not collections.
#-It then reads the archives.inf file from the archives directory of each collection
# to obtain the unique file ID and filepath of every document in the collection
#-It then parses each doc.gml stored at that filepath to gather information
#-From each file it collects the stems of the kea keyphrases.
#-It determines the number of keyphrase stems for each document
#-It searches for the keyphrase stems in the numbered phrase index. If a stem is not there then
# the program writes the keyphrase stem to the phrase index.
#-It searches for the keyphrase stem in the keyphrase to document index. If the stem is there,
# it increments the number of documents that the keyphrase stem appears in and
# appends the hash ID to the list of documents in the entry. If the stem is not there
# then the program writes a new entry for the stem to the keyphrase to document index.
#-It then writes the document ID, the number of phrases, and each phrase number from the index
# followed by the number of times that phrase appears, into the document_keyphrase index
24
#Getopt::Std replaces the Perl 4 library getopts.pl, which is no longer
#shipped with perl (removed from the core distribution in 5.16).
use Getopt::Std;

#Root of the Greenstone installation; every path used by this script hangs off it.
#It stays a package variable because the subroutines below read it.
$gsdlhome = $ENV{'GSDLHOME'};
die "GSDLHOME is not set.\n" unless (defined $gsdlhome && $gsdlhome ne "");

getopts('R'); #process option arguments (sets $opt_R when -R is given)

#if option R remove all previous indexes
if ($opt_R) {
    my $index_dir = "$gsdlhome/bin/script/indexes";

    print STDERR "\nremoving $gsdlhome/bin/script/indexes/keyphrase_index.txt\n";
    print STDERR "removing $gsdlhome/bin/script/indexes/keyphrase_document.txt\n";
    print STDERR "removing $gsdlhome/bin/script/indexes/document_keyphrase.txt\n";

    #unlink instead of system("rm ..."): no shell is involved, so a GSDLHOME
    #containing spaces or shell metacharacters cannot change what is removed
    foreach my $index_file ("$index_dir/keyphrase_document.txt",
                            "$index_dir/document_keyphrase.txt",
                            "$index_dir/keyphrase_index.txt") {
        next unless (-e $index_file); #nothing to remove on a first build
        unlink($index_file) or warn "could not remove $index_file: $!\n";
    }
}

#collections may be specified in the command line
#otherwise, all collections will be used to build
#the indexes.
my @directories;

if (@ARGV) {

    @directories = @ARGV;

} else { #open collect directory and get a list of all collections

    my $collect_dir = "$gsdlhome/collect";

    opendir(DIR, $collect_dir)
        or die "$collect_dir could not be opened.";

    #skip dot entries and the directories named exactly CVS or modelcol (the
    #pattern is anchored so that a collection whose name merely contains one of
    #those words is still built), and skip anything that is not a directory
    @directories = grep { !/^(\.|CVS$|modelcol$)/ && -d "$collect_dir/$_" } readdir(DIR);

    closedir(DIR);
}

#for each collection specified to build indexes for
foreach my $collection (@directories) {

    #archives.inf contains a list of unique hash ID's of each file and file paths
    my $archives_inf = "$gsdlhome/collect/$collection/archives/archives.inf";
    my @filelist;

    open(my $info, '<', $archives_inf)
        or die "$archives_inf could not be opened.";

    while (my $entry = <$info>) { #get each line of text from archives.inf (OID \t filepath)
        chomp($entry);
        next unless ($entry =~ /\S/); #a blank line names no document
        push(@filelist, $entry);
    }

    close($info);

    foreach my $file (@filelist) { #add each document to the indexes
        build_index($file, $collection);
    }
}
72
#Adds one document to the keyphrase indexes.
#
#Arguments:
# $args       - one line of archives.inf: hash ID, a tab, then the filepath of the
#               document relative to the collection's archives directory
# $collection - name of the collection the document belongs to
#
#The document is read once. The stems of its kea keyphrases are taken from the
#stems="..." attribute (if several lines carry one, the last line wins). Everything in
#angled brackets is then stripped from the text and the text is searched for each stem
#phrase, counting how many times it appears. The hash ID and the stems are passed to
#keyphrase_document, and the hash ID, the stems and the counts are passed to
#document_keyphrase, which write the respective indexes.
#A document without stem data is reported on STDERR and left out of the indexes.
#Dies if the document cannot be opened.

sub build_index {

    my $args = shift(@_);
    my $collection = shift(@_);
    my ($ID, $filepath) = split(/\t/, $args);
    my $docpath = "$gsdlhome/collect/$collection/archives/$filepath";
    my $stemsS = "";
    my $text = "";

    print STDERR "\nID: $ID\n";
    print STDERR "filepath: $filepath\n";

    #open file to extract keyphrase information
    open(my $doc, '<', $docpath)
        or die "$docpath could not be opened.";

    #a single pass collects both the stem data and the text of the document
    while (my $line = <$doc>) {
        chomp($line);
        $stemsS = $1 if ($line =~ /stems="([^"]*)"/);
        $text .= $line;
    }

    close($doc);

    print STDERR "stems: $stemsS\n";

    my @stems = split(", ", $stemsS);

    if (!@stems) {
        print STDERR "No stem data was found in file $filepath\n";
        return;
    }

    #chop out all things in angled brackets
    $text =~ s/<[^>]*>//g;

    #using regular expressions generated from stem_reg
    #count how many of each phrase appear in the document
    my @stem_phrase_counts;

    foreach my $stem (@stems) {
        my $count = 0;
        my $reg = stem_reg(split(/\s+/, $stem));

        #an empty stem gives an empty pattern, which perl would read as "reuse the
        #last successful pattern"; such a stem simply keeps a count of 0
        if ($reg ne "") {
            my $scratch = $text; #matches are removed as they are counted, so work on a copy
            $count++ while ($scratch =~ s/$reg//i);
        }

        push(@stem_phrase_counts, $count);
    }

    #write data to keyphrase_document index
    keyphrase_document($ID, $stemsS);

    #write data to document_keyphrase index
    document_keyphrase($ID, $stemsS, join(", ", @stem_phrase_counts));

    return;
}
157
#Returns a regular expression (as a string) designed to search for a stem phrase in text.
#Arguments: the stems of the phrase, one per word, in order.
#eg stem_reg('agri', 'cari') returns 'agri\S*(\s+|\S?)cari\S*', that is
# agri followed by 0 or more non-whitespace characters
# followed by one or more whitespace OR 0 or 1 non-whitespace characters
# cari followed by 0 or more non-whitespace characters
#Each stem is matched literally: regular expression metacharacters in a stem are
#escaped, so a stem such as 'c++' cannot produce an invalid pattern.
#Returns the empty string when called with no stems.
#modified from original by Stephen Lundy

sub stem_reg {

    my @stems = @_;

    #every stem may be followed by the rest of its word; consecutive words are
    #separated by whitespace or by at most one other character
    return join("(\\s+|\\S?)", map { quotemeta($_) . "\\S*" } @stems);
}
185
186
187
#Looks up the phrase index number of each kea phrase stem, assigning new numbers
#where needed.
#
#Argument: the phrase stems as one string, separated by ", ".
#
#The keyphrase index file ($gsdlhome/bin/script/indexes/keyphrase_index.txt) holds
#one phrase per line in this form:
#-phrase index number:phrase
#Each phrase passed in is looked up in that file by exact comparison. A phrase with no
#entry is given the next free index number (one more than the highest number in the
#file) and appended to the file, in the order the phrases were passed. The file is
#created if it does not exist yet.
#This function is called by document_keyphrase and keyphrase_document.
#Returns a table of pairs for the phrases that were sent as arguments to it
#{phrase => phrase index number}.
#Dies if the index exists but cannot be read, or if it cannot be written.

sub keyphrase_index_search {

    my $phrases = shift(@_);
    my @phrases = split(", ", $phrases);
    my $index_file = "$gsdlhome/bin/script/indexes/keyphrase_index.txt";
    my %table;
    my $next_index = 1;

    print STDERR "searching keyphrase index...\n";

    #initialise table of phrases and index numbers ("0" means not in the index yet)
    foreach my $phrase (@phrases) {
        $table{$phrase} = "0";
    }

    #if the index already exists read in the phrases
    if (open(my $index_in, '<', $index_file)) {

        while (my $line = <$index_in>) {
            chomp($line);
            next unless ($line =~ /^(\d+):(.*)$/);
            my ($index, $phrase) = ($1, $2);

            #compare the whole phrase: a substring match would give 'cat'
            #the index number of 'category'
            $table{$phrase} = $index if (exists $table{$phrase});

            #new starting index (one + the highest index seen)
            $next_index = $index + 1 if ($index >= $next_index);
        }

        close($index_in);

    } elsif (-e $index_file) {
        die "$index_file could not be opened: $!";
    }

    #add new phrases to the phrase index
    open(my $index_out, '>>', $index_file)
        or die "$index_file could not be opened for appending: $!";

    foreach my $phrase (@phrases) {
        next unless ($table{$phrase} eq "0"); #already numbered (or a repeated phrase)
        print $index_out "$next_index:$phrase\n";
        $table{$phrase} = "$next_index";
        $next_index++;
    }

    close($index_out)
        or die "$index_file could not be written: $!";

    return %table;
}
247
#Records in the keyphrase_document index that a document contains the given phrases.
#
#Arguments:
# $ID    - hash ID of the document
# $stems - the kea phrase stems that exist for that document, separated by ", "
#
#The index ($gsdlhome/bin/script/indexes/keyphrase_document.txt) is read in full and
#rewritten. It holds one line per phrase:
#-phrase index number:number of documents it appears in:ID|ID|...
#For each phrase of the document that already has a line, the document count is
#incremented and the ID appended, unless the ID is already listed, in which case the
#line is left as it is. A phrase without a line gets a new line 'phrase index:1:ID'
#at the end of the file. Lines that do not have the expected form are copied unchanged.
#Dies if the index exists but cannot be read, or if it cannot be written.

sub keyphrase_document {

    my ($ID, $stems) = @_;
    my $index_file = "$gsdlhome/bin/script/indexes/keyphrase_document.txt";
    my @textlist;

    print STDERR "writing to keyphrase_document.txt...\n";

    #get table of phrases and phrase indexes
    my %table = keyphrase_index_search($stems);

    #phrase index numbers of this document that have not been dealt with yet
    my %pending = map { $_ => 1 } grep { $_ ne "0" } values %table;

    #read in the index if the file exists
    if (open(my $index_in, '<', $index_file)) {
        @textlist = <$index_in>;
        chomp(@textlist);
        close($index_in);
    } elsif (-e $index_file) {
        die "$index_file could not be opened: $!";
    }

    #open index for output
    open(my $index_out, '>', $index_file)
        or die "$index_file could not be opened for writing: $!";

    #amend existing entries
    foreach my $line (@textlist) {

        if ($line =~ /^(\d+):(\d+)(.*)$/) { #all lines of this form
            my ($index, $num_docs, $ids) = ($1, $2, $3);

            #delete marks the phrase as dealt with whether or not the line changes;
            #otherwise a document that is already listed would get a second,
            #duplicate entry every time the script is rerun
            if (delete $pending{$index}) {

                #compare whole IDs so that one ID being a prefix of another
                #does not count as already included
                my @doc_ids = grep { $_ ne "" } split(/[:|]/, $ids);

                if (!grep { $_ eq $ID } @doc_ids) {
                    $num_docs++; #increment number of docs
                    $line = "$index:$num_docs$ids|$ID";
                }
            }
        }

        print $index_out "$line\n";
    }

    #add new phrases to the index: 'phrase index:1:file ID'
    foreach my $index (sort { $a <=> $b } keys %pending) {
        print $index_out "$index:1:$ID\n";
    }

    close($index_out)
        or die "$index_file could not be written: $!";

    return;
}
318
#Writes the entry of one document to the document_keyphrase index.
#
#Arguments:
# $ID     - hash ID of the document
# $stemsS - the kea phrase stems that exist for that document, separated by ", "
# $stem_c - the number of times each stem phrase appears in the document, in the
#           same order as the stems and also separated by ", "
#
#The index ($gsdlhome/bin/script/indexes/document_keyphrase.txt) is read in full and
#rewritten. The line written for the document is:
#-file ID:number of phrases in the document
# |pairs of 'phrase index,number of times the phrase appears in the document'
#with the pairs in ascending order of phrase index. If the ID is already in the file
#its line is replaced (once), otherwise the new line is added to the end of the file.
#Existing lines that do not have the form 'ID:...' are not copied to the new file.
#Dies if the index exists but cannot be read, or if it cannot be written.

sub document_keyphrase {

    my ($ID, $stemsS, $stem_c) = @_;
    my $index_file = "$gsdlhome/bin/script/indexes/document_keyphrase.txt";
    my @textlist;
    my %phrases;

    print STDERR "writing to document_keyphrase.txt...\n";

    #split phrase counts into arrays
    my @stem_counts = split(", ", $stem_c);

    #get table of phrases and phrase indexes
    my %table = keyphrase_index_search($stemsS);

    #split phrases into arrays
    my @stems = split(", ", $stemsS);

    #build new phrases dictionary {phrase index => number of appearances}
    foreach my $i (0 .. $#stems) {
        $phrases{ $table{ $stems[$i] } } = "$stem_counts[$i]";
    }
    my $phrasenum = scalar(keys %phrases); #number of phrases in doc

    #must write this line to the file
    #'document ID:num of phrases|phrase index,number of times phrase appears'
    my $newline = "$ID:$phrasenum";
    foreach my $phrase (sort { $a <=> $b } keys %phrases) {
        $newline .= "|$phrase,$phrases{$phrase}";
    }

    #read in the index if the file exists
    if (open(my $index_in, '<', $index_file)) {
        @textlist = <$index_in>;
        chomp(@textlist);
        close($index_in);
    } elsif (-e $index_file) {
        die "$index_file could not be opened: $!";
    }

    #open index for output
    open(my $index_out, '>', $index_file)
        or die "$index_file could not be opened for writing: $!";

    #if ID is already in the file write line overtop incase
    #someone has modified the file. Otherwise add the line
    #to the end of the file
    my $found = 0;

    foreach my $line (@textlist) {
        next unless ($line =~ /([^:]+):/); #all lines should follow this pattern

        if ($1 eq $ID) { #id is already in the file
            #print line overtop, but only once even if the ID is listed repeatedly
            print $index_out "$newline\n" unless ($found);
            $found = 1;
        } else {
            print $index_out "$line\n"; #print old line out
        }
    }

    print $index_out "$newline\n" unless ($found); #append new line to end of file

    close($index_out)
        or die "$index_file could not be written: $!";

    return;
}
411
412
413
414
415
416
417
418
419
420
Note: See TracBrowser for help on using the repository browser.