Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/bin/script/indexes/buildkpi.pl@ 1971

Last change on this file since 1971 was 1971, checked in by jmt14, 23 years ago

added files: Core.pm PDF.pm Parse.pm amend_pdf.pl

buildkpi.pl buildkpiS.pl buildkpiK.pl relation.pl

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 14.3 KB

Line
1	#! /user/bin/perl
2
3	#usage: perl buildkpi.pl [-R] [collection] [collection] etc
4	#
5	#-r or -R will remove previous index files so that you may build new ones
6	#
7	#The program performs the following tasks:
8	#-gathers the specified collections on the command line OR
9	#-gathers the directories of all the collections in the collect directory, this is all
10	# the directories apart from CVS, modelcol, . and .. which are not collections.
11	#-It then retrieves the archive.inf file from the archive directory of each collection
12	# to obtain the unique file ID and filepath of every document in the collection
13	#-Then parse through each doc.gml stored in filepath to gather information
14	#-From each file collect kea phrases and/or stems
15	#-Determine the number of kea phrases and stems for each document
16	#-Search for the kea phrases in the numbered phrase index. If a phrase is not there then
17	# the program will write the kea phrase to the phrase index .
18	#-Search for the kea phrases in the keyphrase to document index. If the phrase is there,
19	# it will increment the number of documents that the keyphrase appears in and replace that
20	# then append the hash ID to the list of documents in the entry. If the phrase is not there
21	# then the program will write the kea phrase to the phrase index.
22	#-Then write document ID, no of phrase, phrase number from index followed by number of times
23	# phrase appears into the document_keyphrase index
24
25	$gsdlhome = $ENV{'GSDLHOME'};
26
27	require "getopts.pl";
28	&Getopts('R'); #process option arguments
29
30
31	#collections may be specified in the command line
32	#otherwise, all collections will be used to build
33	#the indexes.
34	if(@ARGV){
35
36	@directories = @ARGV;
37
38	} else { #open collect directory and get a list of all collections
39	opendir(DIR, "$gsdlhome/collect");
40	@directories = grep(!/(^\.\|(CVS)\|(modelcol))/, readdir(DIR));
41	closedir(DIR);
42	}
43
44
45	#directory to store indexes in using collection names
46	$dirname = "";
47	foreach $collection (sort @directories){
48	$dirname .= $collection."_";
49	}
50	$dirname .= "indexes";
51	print STDERR "directory name: $dirname\n";
52
53	#if option R remove all previous indexes
54	if($opt_R == 1){ #remove indexes
55	print STDERR "\nremoving $gsdlhome/bin/script/indexes/$dirname/keyphrase_index.txt\n";
56	print STDERR "removing $gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt\n";
57	print STDERR "removing $gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt\n";
58	system("rm $gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt");
59	system("rm $gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt");
60	system("rm $gsdlhome/bin/script/indexes/$dirname/keyphrase_index.txt");
61	}
62
63	#create the new directory, display an error message if the directory
64	#already exists.
65	`mkdir --verbose $gsdlhome/bin/script/indexes/$dirname`;
66
67	#for each collection specified to build indexes for
68	foreach $collection (@directories){
69
70	print STDERR "\nBUILDING INDEXES FOR COLLECTION $collection\n\n";
71
72	my @filelist;
73
74	#archives.inf contains a list of unique hash ID's of each file and file paths
75	open(INFO, "$gsdlhome/collect/$collection/archives/archives.inf")
76	or die "$gsdlhome/collect/$collection/archives/archives.inf could not be opened.";
77
78	while(<INFO>){ #get each line of text from archives.inf (OID \t filepath)
79	chomp;
80	push(@filelist, $_);
81	}
82
83	foreach $file (@filelist){ #add each document to the indexes
84	build_index($file, $collection);
85	}
86	}
87
88	#This function opens the file in the filepath sent as an argument. From this it obtains
89	#the kea and/or stem data, and then searches for these phrases in the file, counting and storing
90	#how many times each phrase appears. The data is then sent to function keyphrase_document
91	#with arguments hash ID, kea phrases and stem phrases to build the keyphrase_document index.
92	#The function which builds the document_keyphrase index is then passed the hash ID, the kea
93	#phrases and/or the stemmed phrases and the array/s which hold the number of times each phrase
94	#appears in the document so that the data it has collected can be written to document_ keyphrase
95	#index.
96
97	sub build_index {
98
99	my $args = shift(@_);
100	my $collection = shift(@_);
101	my ($ID, $filepath) = split(/\t/, $args);
102	my $keaS = "";
103	my $stemsS = "";
104	my @kea_phrase_counts = 0;
105	my @stem_phrase_counts = 0;
106	my $text = "";
107
108	print STDERR "\nID: $ID\n";
109	print STDERR "filepath: $filepath\n";
110
111	#open file to extract keyphrase information
112	open(FILE, "$gsdlhome/collect/$collection/archives/$filepath")
113	or die "$gsdlhome/collect/$collection/archives/$filepath could not be opened.";
114
115	#patterns to search for so that we can extract the kea information
116	my $kea_search = ".* kea=\"([^\"]*)\"";
117	my $stem_search = "stems=\"([^\"]*)\"";
118
119	while(<FILE>){ #get kea and stem data and store
120	chomp;
121	$keaS = $1 if (/$kea_search/);
122	$stemsS = $1 if (/$stem_search/);
123	}
124
125	close(FILE);
126
127	print STDERR "Kea: $keaS\n";
128	print STDERR "stems: $stemsS\n";
129
130	my @kea = split(", ", $keaS);
131	my @stems = split(", ", $stemsS);
132
133	if(@kea && @stems){ #if the data exists
134
135	#open the filepath to the current document
136	open(FILE, "$gsdlhome/collect/$collection/archives/$filepath")
137	or die "$gsdlhome/collect/$collection/archives/$filepath could not be opened.";
138
139	while(<FILE>){ #get the text
140	chomp;
141	$text .= $_;
142	}
143
144	#chop out all things in angled brackets
145	$text =~ s/(<[^>]*>)//g;
146
147	#initilise counts
148	for($i=0; $i<=$#kea; $i++){
149	$kea_phrase_counts[$i] = 0;
150	}
151
152	for($i=0; $i<=$#stems; $i++){
153	$stem_phrase_counts[$i] = 0;
154	}
155
156	print STDERR "counting number of kea phrases in document...\n";
157
158	#using regular expressions generated from kea-reg and stem-reg
159	#count how many of each phrase appear in the document
160	$text_copy = $text;
161	for($i=0; $i<=$#kea; $i++){ #search for text with kea phrases
162	my $phrase = $kea[$i];
163	$reg = &kea_reg(split(/\s+/, $phrase));
164	while($text_copy =~ s/$reg//i){
165	$kea_phrase_counts[$i]++; #count the number of kea phrases
166	}
167	$text_copy = $text;
168	}
169
170	print STDERR "counting number of stemmed phrases in document...\n";
171
172	$text_copy = $text;
173	for($i=0; $i<=$#stems; $i++){ #search for text with stem phrases
174	my $stem = $stems[$i];
175	$reg = &stem_reg(split(/\s+/, $stem));
176	while($text_copy =~ s/$reg//i){
177	$stem_phrase_counts[$i]++; #count the number of stem phrases
178	}
179	$text_copy = $text;
180	}
181
182
183	#write data to keyphrase_document index
184	&keyphrase_document($ID, $keaS, $stemsS);
185
186	#write data to document_keyphrase index
187	$kea_counts = join(", ", @kea_phrase_counts);
188	$stem_counts = join(", ", @stem_phrase_counts);
189	&document_keyphrase($ID, $keaS, $stemsS, $kea_counts, $stem_counts);
190
191	} else {
192	print STDERR "No kea data was found in file $filepath\n";
193	}
194
195	}
196
197	#returns a regular expression designed to
198	#search for stems in text
199	#eg 'agri cari'
200	# agri followed by 0 or more non-whitespace characters
201	# followed by one or more whitespace OR 0 or 1 non-whitespace characters
202	# cari followed by 0 or more non-whitespace characters
203	#modified from original by Stephen Lundy
204
205	sub stem_reg {
206
207	$regexp = "";
208
209	$l = @_;
210
211	if ($l > 0) {
212	$s = shift;
213	$regexp = "$s\\S*";
214
215	if ($l-1 > 0) {
216	foreach $s (@_) {
217	$regexp .= "(\\s+\|\\S?)$s\\S*";
218	}
219	}
220	}
221
222	return $regexp;
223	}
224
225	#returns a regular expression designed to
226	#search for phrases in text
227	#eg 'agris caris'
228	# agris followed by 0 or 1 non-whitespace characters OR
229	# followed by one or more whitespace
230	# caris followed by 0 or 1 non-whitespace characters
231	#modified from original by Stephen Lundy
232
233	sub kea_reg {
234	$regexp = "";
235
236	$l = @_;
237
238	if ($l > 0) {
239	$s = shift;
240	#$regexp = "$s(\\S?)";
241	$regexp = "$s(\\s+\|\\S?)";
242
243	if ($l-1 > 0) {
244	foreach $s (@_) {
245	$regexp .= "$s(\\s+\|\\S?)";
246	}
247	}
248	}
249
250	return $regexp;
251	}
252
253
254	#This function is passed as arguments a list of kea phrases and/or stems. Its purpose is to
255	#check in the keyphrase index file for each phrase and determine whether or not an entry has
256	#been made for that phrase and an index number assigned to it. If there has not been an entry
257	#made then an index number is assigned to the phrase and it is written to the file. This
258	#function is called by document_keyphrase and keyphrase_document. Each line in the file has
259	#this form:
260	#-phrase index number:phrase
261	#This function then returns a table of pairs of the phrases that were sent as arguments to it
262	#{phrase => phrase index number}.
263
264	sub keyphrase_index_search {
265
266	my $phrases = shift(@_);
267	my @phrases = split(", ", $phrases);
268	my %table;
269	my $index = 1;
270	my $create_new_index = 0;
271
272	print STDERR "searching keyphrase index...\n";
273
274	#initilise table of phrases and index numbers
275	foreach $phrase (@phrases){
276	$table{"$phrase"} = "0";
277	}
278
279	#open keyphrase index for appending data and for reading
280	open(INDEX_OUT, ">>$gsdlhome/bin/script/indexes/$dirname/keyphrase_index.txt");
281	open(INDEX_IN, "$gsdlhome/bin/script/indexes/$dirname/keyphrase_index.txt")
282	or $create_new_index = 1;
283
284	if($create_new_index == 0){
285	#if the index already exists read in the phrases
286	while(<INDEX_IN>){
287	chomp;
288	foreach $phrase (@phrases){
289	if(/(\d+):$phrase/){
290	$index = $1;
291	$table{"$phrase"} = "$index";
292	}
293	}
294	$index++; #new starting index (one + the last index)
295	}
296
297	close(INDEX_IN);
298
299	}
300
301	#add new phrases to the phrase index
302	foreach $phrase (keys %table){
303	if($table{"$phrase"} eq "0"){
304	print INDEX_OUT "$index:$phrase\n";
305	$table{"$phrase"} = "$index";
306	$index++;
307	}
308	}
309
310	close(INDEX_OUT);
311	return %table;
312	}
313
314	#This function is passed as arguments file hash ID and a list of kea phrases and/or stems
315	#that exist for that particular file. Its purpose is to write to the keyphrase_document
316	#index a line for the document it has been sent:
317	#-phrase index number:number of documents it appears in\|ID
318	sub keyphrase_document{
319
320	my ($ID, $kea, $stems) = @_;
321	my $text = "";
322	my @textlist;
323	my $create_new_index = 0;
324
325	print STDERR "writing to keyphrase_document.txt...\n";
326
327	#get table of phrases and phrase indexes
328	my %table = keyphrase_index_search($kea.", ".$stems);
329
330
331	#open index for reading
332	open(INDEX_IN, "$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt")
333	or $create_new_index = 1;
334
335	#read in document if file exists
336	if($create_new_index == 0){
337
338	while(<INDEX_IN>){
339	$text .= $_;
340	}
341
342	close(INDEX_IN);
343
344	#split text into lines
345	@textlist = split(/\n/, $text);
346
347	}
348
349	#open index for output
350	open(INDEX_OUT, ">$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt");
351
352	if($create_new_index == 0){ #amend existing index
353
354	foreach $line (@textlist){
355	foreach $phrase (keys %table){
356	if($line =~ /(\d+):(\d+)(.*)/){ #all lines of this form
357	$index = $1;
358	if($table{"$phrase"} eq "$index") { #if phrase exists in index
359	$ids = $3; #get all doc IDs for that keyphrase
360	if($ids !~ /$ID/){ #if doc ID not already included
361	$num_docs = $2;
362	$num_docs++; #increment number of docs
363	$line = "$index:$num_docs$3\|$ID"; #line to append to index
364	$table{"$phrase"} = "0";
365	}
366	}
367	}
368	}
369	print INDEX_OUT "$line\n";
370	}
371	}
372
373	#add new phrases to the index
374	foreach $phrase (keys %table){ #write 'phrase index:1:file ID
375	if($table{"$phrase"} ne "0"){
376	my $line = "$table{$phrase}:1:$ID";
377	print INDEX_OUT "$line\n";
378	}
379	}
380
381	close(INDEX_OUT);
382
383	}
384
385	#This function is passed as arguments file hash ID and a list of kea phrases and/or stems
386	#that exist for that particular file and a list of the number of times each kea and/or stem
387	#phrase appear in that document. Its purpose is to write to the document_keyphrase
388	#index a line for the document it has been sent:
389	#-file ID:number of phrases and/or stems appear in the document
390	# \|pairs of 'phrase index,number of times the phrase appears in the document'
391	sub document_keyphrase {
392
393	my ($ID, $keaS, $stemsS, $kea_c, $stem_c) = @_;
394	my $text = "";
395	my @textlist;
396	my %phrases;
397	my $create_new_index = 0;
398
399	print STDERR "writing to document_keyphrase.txt...\n";
400
401	#split phrase counts into arrays
402	my @kea_counts = split(", ", $kea_c);
403	my @stem_counts = split(", ", $stem_c);
404
405	#get table of phrases and phrase indexes
406	my %table = keyphrase_index_search($keaS.", ".$stemsS);
407
408	#split phrases into arrays
409	my @kea = split(", ", $keaS);
410	my @stems = split(", ", $stemsS);
411
412	#build new phrases dictionary
413	for($i=0; $i<=$#kea; $i++){
414	my $phrase = $table{"$kea[$i]"};
415	if($kea_counts[$i] > 0){
416	$phrases{"$phrase"} = "$kea_counts[$i]";
417	} else {
418	$phrases{"$phrase"} = 1;
419	}
420	}
421	for($i=0; $i<=$#stems; $i++){
422	my $phrase = $table{"$stems[$i]"};
423	if($stem_counts[$i] > 0){
424	$phrases{"$phrase"} = "$stem_counts[$i]";
425	} else {
426	$phrases{"$phrase"} = 1;
427	}
428	}
429	my @num = keys %phrases;
430	my $phrasenum = $#num + 1; #number of phrases in doc
431
432	#open index for reading
433	open(INDEX_IN, "$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt")
434	or $create_new_index = 1;
435
436
437	if($create_new_index == 0){ #index doesn't need to be created
438
439	while(<INDEX_IN>){
440	$text .= $_;
441	}
442
443	close(INDEX_IN);
444
445	#split text into lines
446	@textlist = split(/\n/, $text);
447
448	}
449
450
451	#must write this line to the file
452	#document ID:num of phrases\|phrase index, number of times phrases appears
453	my $newline = "$ID:$phrasenum";
454	foreach $phrase (keys %phrases){
455	$newline .= "\|$phrase,$phrases{$phrase}";
456	}
457
458	#open index for output
459	open(INDEX_OUT, ">$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt");
460
461	if($create_new_index == 1){ #create a new index
462
463	print INDEX_OUT "$newline\n";
464
465	} else {
466
467	#if ID is already in the file write line overtop incase
468	#someone has modified the file. Otherwise add the line
469	#to the end of the file
470	my $found = 0;
471
472	foreach $line (@textlist){
473	if($line =~ /([^:]+):(.*)/){ #all lines should follow this pattern
474	$id = $1;
475	if($ID eq $id){ #id is already in the file
476	print INDEX_OUT "$newline\n"; #print line overtop
477	$found = 1;
478	} else {
479	print INDEX_OUT "$line\n"; #print old line out
480	}
481	}
482	}
483
484	print INDEX_OUT "$newline\n" if ($found == 0); #append new line to end of file
485
486	}
487
488	close(INDEX_OUT);
489
490	}
491

Note: See TracBrowser for help on using the repository browser.

Download in other formats: