[1971] | 1 | #! /user/bin/perl
|
---|
| 2 |
|
---|
| 3 | #usage: perl buildkpiK.pl [-R] [collection] [collection] etc
|
---|
| 4 | #
|
---|
| 5 | #-r or -R will remove previous index files so that you may build new ones
|
---|
| 6 | #
|
---|
| 7 | #The program performs the following tasks:
|
---|
| 8 | #-gathers the specified collections on the command line OR
|
---|
| 9 | #-gathers the directories of all the collections in the collect directory, this is all
|
---|
| 10 | # the directories apart from CVS, modelcol, . and .. which are not collections.
|
---|
| 11 | #-It then retrieves the archive.inf file from the archive directory of each collection
|
---|
| 12 | # to obtain the unique file ID and filepath of every document in the collection
|
---|
| 13 | #-Then parse through each doc.gml stored in filepath to gather information
|
---|
| 14 | #-From each file collect kea phrases.
|
---|
| 15 | #-Determine the number of kea phrases for each document
|
---|
| 16 | #-Search for the kea phrases in the numbered phrase index. If a phrase is not there then
|
---|
| 17 | # the program will write the kea phrase to the phrase index .
|
---|
| 18 | #-Search for the kea phrases in the keyphrase to document index. If the phrase is there,
|
---|
| 19 | # it will increment the number of documents that the keyphrase appears in and replace that
|
---|
| 20 | # then append the hash ID to the list of documents in the entry. If the phrase is not there
|
---|
| 21 | # then the program will write the kea phrase to the phrase index.
|
---|
| 22 | #-Then write document ID, no of phrase, phrase number from index followed by number of times
|
---|
| 23 | # phrase appears into the document_keyphrase index
|
---|
| 24 |
|
---|
| 25 | $gsdlhome = $ENV{'GSDLHOME'};
|
---|
| 26 | $collection;
|
---|
| 27 |
|
---|
| 28 | require "getopts.pl";
|
---|
| 29 | &Getopts('R'); #process option arguments
|
---|
| 30 |
|
---|
| 31 | #if option R remove all previous indexes
|
---|
| 32 | if($opt_R == 1){ #remove indexes
|
---|
| 33 | print STDERR "\nremoving $gsdlhome/bin/script/indexes/keyphrase_index.txt\n";
|
---|
| 34 | print STDERR "removing $gsdlhome/bin/script/indexes/keyphrase_document.txt\n";
|
---|
| 35 | print STDERR "removing $gsdlhome/bin/script/indexes/document_keyphrase.txt\n";
|
---|
| 36 | system("rm $gsdlhome/bin/script/indexes/keyphrase_document.txt");
|
---|
| 37 | system("rm $gsdlhome/bin/script/indexes/document_keyphrase.txt");
|
---|
| 38 | system("rm $gsdlhome/bin/script/indexes/keyphrase_index.txt");
|
---|
| 39 | }
|
---|
| 40 |
|
---|
| 41 | #collections may be specified in the command line
|
---|
| 42 | #otherwise, all collections will be used to build
|
---|
| 43 | #the indexes.
|
---|
| 44 | if(@ARGV){
|
---|
| 45 |
|
---|
| 46 | @directories = @ARGV;
|
---|
| 47 |
|
---|
| 48 | } else { #open collect directory and get a list of all collections
|
---|
| 49 | opendir(DIR, "$gsdlhome/collect");
|
---|
| 50 | @directories = grep(!/(^\.|(CVS)|(modelcol))/, readdir(DIR));
|
---|
| 51 | closedir(DIR);
|
---|
| 52 | }
|
---|
| 53 |
|
---|
| 54 | #for each collection specified to build indexes for
|
---|
| 55 | foreach $collection (@directories){
|
---|
| 56 |
|
---|
| 57 | my @filelist;
|
---|
| 58 |
|
---|
| 59 | #archives.inf contains a list of unique hash ID's of each file and file paths
|
---|
| 60 | open(INFO, "$gsdlhome/collect/$collection/archives/archives.inf")
|
---|
| 61 | or die "$gsdlhome/collect/$collection/archives/archives.inf could not be opened.";
|
---|
| 62 |
|
---|
| 63 | while(<INFO>){ #get each line of text from archives.inf (OID \t filepath)
|
---|
| 64 | chomp;
|
---|
| 65 | push(@filelist, $_);
|
---|
| 66 | }
|
---|
| 67 |
|
---|
| 68 | foreach $file (@filelist){ #add each document to the indexes
|
---|
| 69 | build_index($file, $collection);
|
---|
| 70 | }
|
---|
| 71 | }
|
---|
| 72 |
|
---|
| 73 | #This function opens the file in the filepath sent as an argument. From this it obtains
|
---|
| 74 | #the kea data, and then searches for these phrases in the file, counting and storing
|
---|
| 75 | #how many times each phrase appears. The data is then sent to function keyphrase_document
|
---|
| 76 | #with arguments hash ID, kea phrases to build the keyphrase_document index.
|
---|
| 77 | #The function which builds the document_keyphrase index is then passed the hash ID, the kea
|
---|
| 78 | #phrases and the array/s which hold the number of times each phrase
|
---|
| 79 | #appears in the document so that the data it has collected can be written to document_ keyphrase
|
---|
| 80 | #index.
|
---|
| 81 |
|
---|
| 82 | sub build_index {
|
---|
| 83 |
|
---|
| 84 | my $args = shift(@_);
|
---|
| 85 | my $collection = shift(@_);
|
---|
| 86 | my ($ID, $filepath) = split(/\t/, $args);
|
---|
| 87 | my $keaS = "";
|
---|
| 88 | my @kea_phrase_counts = 0;
|
---|
| 89 | my $text = "";
|
---|
| 90 |
|
---|
| 91 | print STDERR "\nID: $ID\n";
|
---|
| 92 | print STDERR "filepath: $filepath\n";
|
---|
| 93 |
|
---|
| 94 | #open file to extract keyphrase information
|
---|
| 95 | open(FILE, "$gsdlhome/collect/$collection/archives/$filepath")
|
---|
| 96 | or die "$gsdlhome/collect/$collection/archives/$filepath could not be opened.";
|
---|
| 97 |
|
---|
| 98 | #patterns to search for so that we can extract the kea information
|
---|
| 99 | my $kea_search = ".* kea=\"([^\"]*)\"";
|
---|
| 100 |
|
---|
| 101 | while(<FILE>){ #get kea and stem data and store
|
---|
| 102 | chomp;
|
---|
| 103 | $keaS = $1 if (/$kea_search/);
|
---|
| 104 | }
|
---|
| 105 |
|
---|
| 106 | close(FILE);
|
---|
| 107 |
|
---|
| 108 | print STDERR "Kea: $keaS\n";
|
---|
| 109 |
|
---|
| 110 | my @kea = split(", ", $keaS);
|
---|
| 111 |
|
---|
| 112 | if(@kea){ #if the data exists
|
---|
| 113 |
|
---|
| 114 | #open the filepath to the current document
|
---|
| 115 | open(FILE, "$gsdlhome/collect/$collection/archives/$filepath")
|
---|
| 116 | or die "$gsdlhome/collect/$collection/archives/$filepath could not be opened.";
|
---|
| 117 |
|
---|
| 118 | while(<FILE>){ #get the text
|
---|
| 119 | chomp;
|
---|
| 120 | $text .= $_;
|
---|
| 121 | }
|
---|
| 122 |
|
---|
| 123 | #chop out all things in angled brackets
|
---|
| 124 | $text =~ s/(<[^>]*>)//g;
|
---|
| 125 |
|
---|
| 126 | #initilise counts
|
---|
| 127 | for($i=0; $i<=$#kea; $i++){
|
---|
| 128 | $kea_phrase_counts[$i] = 0;
|
---|
| 129 | }
|
---|
| 130 |
|
---|
| 131 |
|
---|
| 132 | #using regular expressions generated from kea-reg
|
---|
| 133 | #count how many of each phrase appear in the document
|
---|
| 134 | $text_copy = $text;
|
---|
| 135 | for($i=0; $i<=$#kea; $i++){ #search for text with kea phrases
|
---|
| 136 | my $phrase = $kea[$i];
|
---|
| 137 | $reg = &kea_reg(split(/\s+/, $phrase));
|
---|
| 138 | while($text_copy =~ s/$reg//i){
|
---|
| 139 | $kea_phrase_counts[$i]++; #count the number of kea phrases
|
---|
| 140 | }
|
---|
| 141 | $text_copy = $text;
|
---|
| 142 | }
|
---|
| 143 |
|
---|
| 144 | #write data to keyphrase_document index
|
---|
| 145 | &keyphrase_document($ID, $keaS);
|
---|
| 146 |
|
---|
| 147 | #write data to document_keyphrase index
|
---|
| 148 | $kea_counts = join(", ", @kea_phrase_counts);
|
---|
| 149 | &document_keyphrase($ID, $keaS, $kea_counts);
|
---|
| 150 |
|
---|
| 151 | } else {
|
---|
| 152 | print STDERR "No kea data was found in file $filepath\n";
|
---|
| 153 | }
|
---|
| 154 |
|
---|
| 155 | }
|
---|
| 156 |
|
---|
| 157 |
|
---|
| 158 | #returns a regular expression designed to
|
---|
| 159 | #search for phrases in text
|
---|
| 160 | #eg 'agris caris'
|
---|
| 161 | # agris followed by 0 or 1 non-whitespace characters OR
|
---|
| 162 | # followed by one or more whitespace
|
---|
| 163 | # caris followed by 0 or 1 non-whitespace characters
|
---|
| 164 | #modified from original by Stephen Lundy
|
---|
| 165 |
|
---|
| 166 | sub kea_reg {
|
---|
| 167 | $regexp = "";
|
---|
| 168 |
|
---|
| 169 | $l = @_;
|
---|
| 170 |
|
---|
| 171 | if ($l > 0) {
|
---|
| 172 | $s = shift;
|
---|
| 173 | #$regexp = "$s(\\S?)";
|
---|
| 174 | $regexp = "$s(\\s+|\\S?)";
|
---|
| 175 |
|
---|
| 176 | if ($l-1 > 0) {
|
---|
| 177 | foreach $s (@_) {
|
---|
| 178 | $regexp .= "$s(\\s+|\\S?)";
|
---|
| 179 | }
|
---|
| 180 | }
|
---|
| 181 | }
|
---|
| 182 |
|
---|
| 183 | return $regexp;
|
---|
| 184 | }
|
---|
| 185 |
|
---|
| 186 |
|
---|
| 187 | #This function is passed as arguments a list of kea phrases and/or stems. Its purpose is to
|
---|
| 188 | #check in the keyphrase index file for each phrase and determine whether or not an entry has
|
---|
| 189 | #been made for that phrase and an index number assigned to it. If there has not been an entry
|
---|
| 190 | #made then an index number is assigned to the phrase and it is written to the file. This
|
---|
| 191 | #function is called by document_keyphrase and keyphrase_document. Each line in the file has
|
---|
| 192 | #this form:
|
---|
| 193 | #-phrase index number:phrase
|
---|
| 194 | #This function then returns a table of pairs of the phrases that were sent as arguments to it
|
---|
| 195 | #{phrase => phrase index number}.
|
---|
| 196 |
|
---|
| 197 | sub keyphrase_index_search {
|
---|
| 198 |
|
---|
| 199 | my $phrases = shift(@_);
|
---|
| 200 | my @phrases = split(", ", $phrases);
|
---|
| 201 | my %table;
|
---|
| 202 | my $index = 1;
|
---|
| 203 | my $create_new_index = 0;
|
---|
| 204 |
|
---|
| 205 | print STDERR "searching keyphrase index...\n";
|
---|
| 206 |
|
---|
| 207 | #initilise table of phrases and index numbers
|
---|
| 208 | foreach $phrase (@phrases){
|
---|
| 209 | $table{"$phrase"} = "0";
|
---|
| 210 | }
|
---|
| 211 |
|
---|
| 212 | #open keyphrase index for appending data and for reading
|
---|
| 213 | open(INDEX_OUT, ">>$gsdlhome/bin/script/indexes/keyphrase_index.txt");
|
---|
| 214 | open(INDEX_IN, "$gsdlhome/bin/script/indexes/keyphrase_index.txt")
|
---|
| 215 | or $create_new_index = 1;
|
---|
| 216 |
|
---|
| 217 | if($create_new_index == 0){
|
---|
| 218 | #if the index already exists read in the phrases
|
---|
| 219 | while(<INDEX_IN>){
|
---|
| 220 | chomp;
|
---|
| 221 | foreach $phrase (@phrases){
|
---|
| 222 | if(/(\d+):$phrase/){
|
---|
| 223 | $index = $1;
|
---|
| 224 | $table{"$phrase"} = "$index";
|
---|
| 225 | }
|
---|
| 226 | }
|
---|
| 227 | $index++; #new starting index (one + the last index)
|
---|
| 228 | }
|
---|
| 229 |
|
---|
| 230 | close(INDEX_IN);
|
---|
| 231 |
|
---|
| 232 | }
|
---|
| 233 |
|
---|
| 234 | #add new phrases to the phrase index
|
---|
| 235 | foreach $phrase (keys %table){
|
---|
| 236 | if($table{"$phrase"} eq "0"){
|
---|
| 237 | print INDEX_OUT "$index:$phrase\n";
|
---|
| 238 | $table{"$phrase"} = "$index";
|
---|
| 239 | $index++;
|
---|
| 240 | }
|
---|
| 241 | }
|
---|
| 242 |
|
---|
| 243 | close(INDEX_OUT);
|
---|
| 244 | return %table;
|
---|
| 245 | }
|
---|
| 246 |
|
---|
| 247 | #This function is passed as arguments file hash ID and a list of kea phrases
|
---|
| 248 | #that exist for that particular file. Its purpose is to write to the keyphrase_document
|
---|
| 249 | #index a line for the document it has been sent:
|
---|
| 250 | #-phrase index number:number of documents it appears in|ID
|
---|
| 251 | sub keyphrase_document{
|
---|
| 252 |
|
---|
| 253 | my ($ID, $kea) = @_;
|
---|
| 254 | my $text = "";
|
---|
| 255 | my @textlist;
|
---|
| 256 | my $create_new_index = 0;
|
---|
| 257 |
|
---|
| 258 | print STDERR "writing to keyphrase_document.txt...\n";
|
---|
| 259 |
|
---|
| 260 | #get table of phrases and phrase indexes
|
---|
| 261 | my %table = keyphrase_index_search($kea);
|
---|
| 262 |
|
---|
| 263 |
|
---|
| 264 | #open index for reading
|
---|
| 265 | open(INDEX_IN, "$gsdlhome/bin/script/indexes/keyphrase_document.txt")
|
---|
| 266 | or $create_new_index = 1;
|
---|
| 267 |
|
---|
| 268 | #read in document if file exists
|
---|
| 269 | if($create_new_index == 0){
|
---|
| 270 |
|
---|
| 271 | while(<INDEX_IN>){
|
---|
| 272 | $text .= $_;
|
---|
| 273 | }
|
---|
| 274 |
|
---|
| 275 | close(INDEX_IN);
|
---|
| 276 |
|
---|
| 277 | #split text into lines
|
---|
| 278 | @textlist = split(/\n/, $text);
|
---|
| 279 |
|
---|
| 280 | }
|
---|
| 281 |
|
---|
| 282 | #open index for output
|
---|
| 283 | open(INDEX_OUT, ">$gsdlhome/bin/script/indexes/keyphrase_document.txt");
|
---|
| 284 |
|
---|
| 285 | if($create_new_index == 0){ #amend existing index
|
---|
| 286 |
|
---|
| 287 | foreach $line (@textlist){
|
---|
| 288 | foreach $phrase (keys %table){
|
---|
| 289 | if($line =~ /(\d+):(\d+)(.*)/){ #all lines of this form
|
---|
| 290 | $index = $1;
|
---|
| 291 | if($table{"$phrase"} eq "$index") { #if phrase exists in index
|
---|
| 292 | $ids = $3; #get all doc IDs for that keyphrase
|
---|
| 293 | if($ids !~ /$ID/){ #if doc ID not already included
|
---|
| 294 | $num_docs = $2;
|
---|
| 295 | $num_docs++; #increment number of docs
|
---|
| 296 | $line = "$index:$num_docs$3|$ID"; #line to append to index
|
---|
| 297 | $table{"$phrase"} = "0";
|
---|
| 298 | }
|
---|
| 299 | }
|
---|
| 300 | }
|
---|
| 301 | }
|
---|
| 302 | print INDEX_OUT "$line\n";
|
---|
| 303 | }
|
---|
| 304 | }
|
---|
| 305 |
|
---|
| 306 | #add new phrases to the index
|
---|
| 307 | foreach $phrase (keys %table){ #write 'phrase index:1:file ID
|
---|
| 308 | if($table{"$phrase"} ne "0"){
|
---|
| 309 | my $line = "$table{$phrase}:1:$ID";
|
---|
| 310 | print INDEX_OUT "$line\n";
|
---|
| 311 | }
|
---|
| 312 | }
|
---|
| 313 |
|
---|
| 314 | close(INDEX_OUT);
|
---|
| 315 |
|
---|
| 316 | }
|
---|
| 317 |
|
---|
| 318 | #This function is passed as arguments file hash ID and a list of kea phrases and/or stems
|
---|
| 319 | #that exist for that particular file and a list of the number of times each kea and/or stem
|
---|
| 320 | #phrase appear in that document. Its purpose is to write to the document_keyphrase
|
---|
| 321 | #index a line for the document it has been sent:
|
---|
| 322 | #-file ID:number of phrases and/or stems appear in the document
|
---|
| 323 | # |pairs of 'phrase index,number of times the phrase appears in the document'
|
---|
| 324 | sub document_keyphrase {
|
---|
| 325 |
|
---|
| 326 | my ($ID, $keaS, $kea_c) = @_;
|
---|
| 327 | my $text = "";
|
---|
| 328 | my @textlist;
|
---|
| 329 | my %phrases;
|
---|
| 330 | my $create_new_index = 0;
|
---|
| 331 |
|
---|
| 332 | print STDERR "writing to document_keyphrase.txt...\n";
|
---|
| 333 |
|
---|
| 334 | #split phrase counts into arrays
|
---|
| 335 | my @kea_counts = split(", ", $kea_c);
|
---|
| 336 |
|
---|
| 337 | #get table of phrases and phrase indexes
|
---|
| 338 | my %table = keyphrase_index_search($keaS);
|
---|
| 339 |
|
---|
| 340 | #split phrases into arrays
|
---|
| 341 | my @kea = split(", ", $keaS);
|
---|
| 342 |
|
---|
| 343 | #build new phrases dictionary
|
---|
| 344 | for($i=0; $i<=$#kea; $i++){
|
---|
| 345 | my $phrase = $table{"$kea[$i]"};
|
---|
| 346 | $phrases{"$phrase"} = "$kea_counts[$i]";
|
---|
| 347 | }
|
---|
| 348 |
|
---|
| 349 | my @num = keys %phrases;
|
---|
| 350 | my $phrasenum = $#num + 1; #number of phrases in doc
|
---|
| 351 |
|
---|
| 352 | #open index for reading
|
---|
| 353 | open(INDEX_IN, "$gsdlhome/bin/script/indexes/document_keyphrase.txt")
|
---|
| 354 | or $create_new_index = 1;
|
---|
| 355 |
|
---|
| 356 |
|
---|
| 357 | if($create_new_index == 0){ #index doesn't need to be created
|
---|
| 358 |
|
---|
| 359 | while(<INDEX_IN>){
|
---|
| 360 | $text .= $_;
|
---|
| 361 | }
|
---|
| 362 |
|
---|
| 363 | close(INDEX_IN);
|
---|
| 364 |
|
---|
| 365 | #split text into lines
|
---|
| 366 | @textlist = split(/\n/, $text);
|
---|
| 367 |
|
---|
| 368 | }
|
---|
| 369 |
|
---|
| 370 |
|
---|
| 371 | #must write this line to the file
|
---|
| 372 | #'document ID:num of phrases|phrase index, number of times phrases appears
|
---|
| 373 | my $newline = "$ID:$phrasenum";
|
---|
| 374 | foreach $phrase (keys %phrases){
|
---|
| 375 | $newline .= "|$phrase,$phrases{$phrase}";
|
---|
| 376 | }
|
---|
| 377 |
|
---|
| 378 | #open index for output
|
---|
| 379 | open(INDEX_OUT, ">$gsdlhome/bin/script/indexes/document_keyphrase.txt");
|
---|
| 380 |
|
---|
| 381 | if($create_new_index == 1){ #create a new index
|
---|
| 382 |
|
---|
| 383 | print INDEX_OUT "$newline\n";
|
---|
| 384 |
|
---|
| 385 | } else {
|
---|
| 386 |
|
---|
| 387 | #if ID is already in the file write line overtop incase
|
---|
| 388 | #someone has modified the file. Otherwise add the line
|
---|
| 389 | #to the end of the file
|
---|
| 390 | my $found = 0;
|
---|
| 391 |
|
---|
| 392 | foreach $line (@textlist){
|
---|
| 393 | if($line =~ /([^:]+):(.*)/){ #all lines should follow this pattern
|
---|
| 394 | $id = $1;
|
---|
| 395 | if($ID eq $id){ #id is already in the file
|
---|
| 396 | print INDEX_OUT "$newline\n"; #print line overtop
|
---|
| 397 | $found = 1;
|
---|
| 398 | } else {
|
---|
| 399 | print INDEX_OUT "$line\n"; #print old line out
|
---|
| 400 | }
|
---|
| 401 | }
|
---|
| 402 | }
|
---|
| 403 |
|
---|
| 404 | print INDEX_OUT "$newline\n" if ($found == 0); #append new line to end of file
|
---|
| 405 |
|
---|
| 406 | }
|
---|
| 407 |
|
---|
| 408 | close(INDEX_OUT);
|
---|
| 409 |
|
---|
| 410 | }
|
---|
| 411 |
|
---|