Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: main/trunk/greenstone2/bin/script/indexes/buildkpiS.pl@ 24192

Last change on this file since 24192 was 1971, checked in by jmt14, 23 years ago

added files: Core.pm PDF.pm Parse.pm amend_pdf.pl

buildkpi.pl buildkpiS.pl buildkpiK.pl relation.pl

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 12.0 KB

Line
1	#!/usr/bin/perl
2
3	#usage: perl buildkpiS.pl [-R] [collection] [collection] etc
4	#
5	#-R will remove previous index files so that you may build new ones
6	#
7	#The program performs the following tasks:
8	#-gathers the specified collections on the command line OR
9	#-gathers the directories of all the collections in the collect directory, this is all
10	# the directories apart from CVS, modelcol, . and .. which are not collections.
11	#-It then retrieves the archives.inf file from the archives directory of each collection
12	# to obtain the unique file ID and filepath of every document in the collection
13	#-Then parse through each doc.gml stored in filepath to gather information
14	#-From each file collect stems of kea keyphrases.
15	#-Determine the number of keyphrase stems for each document
16	#-Search for the keyphrase stems in the numbered phrase index. If a phrase is not there then
17	# the program will write the keyphrase stem to the phrase index.
18	#-Search for the keyphrase stem in the keyphrase to document index. If the stem is there,
19	# it will increment the number of documents that the keyphrase stem appears in and
20	# append the hash ID to the list of documents in the entry. If the stem is not there
21	# then the program will add a new entry for it to the keyphrase to document index.
22	#-Then write document ID, no of phrase, phrase number from index followed by number of times
23	# phrase appears into the document_keyphrase index
24
# Root of the Greenstone installation; every path below is built from it.
# It stays a package variable because the subroutines in this file read it.
our $gsdlhome = $ENV{'GSDLHOME'};
die "The GSDLHOME environment variable is not set.\n"
    unless defined $gsdlhome && length $gsdlhome;

# Getopt::Std replaces the Perl 4 library getopts.pl, which is no longer
# shipped with perl. Called without a hash reference, getopts() sets the
# package variable $opt_R exactly as &Getopts('R') did.
use Getopt::Std;
our $opt_R;
getopts('R');    # process option arguments

# if option R was given, remove all previous indexes
if ($opt_R) {
    my $index_dir = "$gsdlhome/bin/script/indexes";

    print STDERR "\n";
    foreach my $name ('keyphrase_index.txt', 'keyphrase_document.txt',
                      'document_keyphrase.txt') {
        print STDERR "removing $index_dir/$name\n";
    }

    # unlink() rather than system("rm ...") so the path never goes through
    # a shell; a file that is already absent is not an error.
    foreach my $name ('keyphrase_document.txt', 'document_keyphrase.txt',
                      'keyphrase_index.txt') {
        my $path = "$index_dir/$name";
        if (-e $path && !unlink($path)) {
            warn "could not remove $path: $!\n";
        }
    }
}

# collections may be specified on the command line;
# otherwise all collections are used to build the indexes.
my @directories;
if (@ARGV) {
    @directories = @ARGV;
}
else {
    # every sub-directory of collect is a collection, except hidden
    # entries (including . and ..), CVS and modelcol
    my $collect_dir = "$gsdlhome/collect";
    opendir(my $dh, $collect_dir)
        or die "$collect_dir could not be opened: $!";
    @directories = sort
                   grep { !/\A(?:\.|CVS\z|modelcol\z)/ && -d "$collect_dir/$_" }
                   readdir($dh);
    closedir($dh);
}

# for each collection specified to build indexes for
foreach my $collection (@directories) {

    # archives.inf lists one document per line: unique hash ID, a tab,
    # then the file path relative to the archives directory
    my $archives_inf = "$gsdlhome/collect/$collection/archives/archives.inf";
    open(my $info, '<', $archives_inf)
        or die "$archives_inf could not be opened: $!";

    my @filelist;
    while (my $entry = <$info>) {
        chomp $entry;
        next unless $entry =~ /\S/;    # ignore blank lines
        push(@filelist, $entry);
    }
    close($info);

    # add each document to the indexes
    foreach my $file (@filelist) {
        build_index($file, $collection);
    }
}
72
# build_index($entry, $collection)
#
# Adds one document to the indexes.
#   $entry      - a line from archives.inf: "hash ID<TAB>file path", the path
#                 being relative to the collection's archives directory
#   $collection - name of the collection the document belongs to
#
# The document is read once. The value of the last stems="..." attribute
# found is taken as the comma separated list of keyphrase stems. All text in
# angled brackets is then removed and the occurrences of every stem phrase
# in the remaining text are counted, using the pattern built by stem_reg.
# The hash ID and stems are passed to keyphrase_document, and the hash ID,
# stems and counts to document_keyphrase, which write the index files.
#
# Dies if the document cannot be opened. If the document has no stems a
# message is printed to STDERR and nothing is written.
sub build_index {

    my ($args, $collection) = @_;
    my ($ID, $filepath) = split(/\t/, $args);

    print STDERR "\nID: $ID\n";
    print STDERR "filepath: $filepath\n";

    my $document = "$gsdlhome/collect/$collection/archives/$filepath";
    open(my $fh, '<', $document)
        or die "$document could not be opened: $!";

    # one pass collects both the stems attribute and the document text;
    # lines are joined without a separator
    my $stemsS = "";
    my $text   = "";
    while (my $line = <$fh>) {
        chomp $line;
        $stemsS = $1 if $line =~ /stems="([^"]*)"/;
        $text .= $line;
    }
    close($fh);

    print STDERR "stems: $stemsS\n";

    my @stems = split(", ", $stemsS);

    unless (@stems) {
        print STDERR "No stem data was found in file $filepath\n";
        return;
    }

    # chop out all things in angled brackets
    $text =~ s/<[^>]*>//g;

    # count how many times each stem phrase appears in the document
    my @stem_phrase_counts;
    foreach my $stem (@stems) {
        my $reg   = stem_reg(split(/\s+/, $stem));
        my $count = 0;

        # An empty pattern would never stop matching, so it counts as zero.
        # The text is scanned with //g instead of deleting each match:
        # deleting splices the neighbouring text together, which can create
        # matches that are not in the document. A while loop is used
        # because the pattern contains capture groups, so counting the list
        # returned by //g would count captures rather than matches.
        if ($reg ne "") {
            $count++ while $text =~ /$reg/gi;
        }

        push(@stem_phrase_counts, $count);
    }

    # write data to keyphrase_document index
    keyphrase_document($ID, $stemsS);

    # write data to document_keyphrase index
    document_keyphrase($ID, $stemsS, join(", ", @stem_phrase_counts));
}
157
# stem_reg(@words)
#
# Returns, as a string, a regular expression that finds a stemmed phrase in
# text. @words are the stems of the phrase, one per word.
# eg 'agri', 'cari' gives
#   agri followed by 0 or more non-whitespace characters
#   followed by one or more whitespace OR 0 or 1 non-whitespace characters
#   cari followed by 0 or more non-whitespace characters
# Each stem is escaped, so characters that are special in a regular
# expression are matched literally. Empty words are ignored. Returns the
# empty string when no words remain.
# modified from original by Stephen Lundy
sub stem_reg {

    my @words = map { quotemeta($_) } grep { defined $_ && length $_ } @_;

    return "" unless @words;

    my $regexp = shift(@words) . '\S*';

    foreach my $word (@words) {
        $regexp .= '(\s+|\S?)' . $word . '\S*';
    }

    return $regexp;
}
185
186
187
# keyphrase_index_search($phrases)
#
# $phrases is a list of kea phrase stems separated by ", ". Each phrase is
# looked up in the keyphrase index file, whose lines have this form:
# -phrase index number:phrase
# A phrase that has no entry yet is given the next free index number and
# appended to the file. The next free number is one more than the highest
# number already in the file, or 1 for a new file.
#
# This function is called by document_keyphrase and keyphrase_document.
# It returns a table of pairs for the phrases that were sent to it
# {phrase => phrase index number}.
#
# Dies if the index exists but cannot be read, or cannot be appended to.
sub keyphrase_index_search {

    my ($phrases) = @_;
    my @phrases    = split(", ", $phrases);
    my $index_file = "$gsdlhome/bin/script/indexes/keyphrase_index.txt";

    print STDERR "searching keyphrase index...\n";

    # initialise table of phrases and index numbers; "0" means not found
    my %table = map { $_ => "0" } @phrases;

    my $next_index = 1;

    # if the index already exists read in the phrases
    if (-e $index_file) {
        open(my $in, '<', $index_file)
            or die "$index_file could not be opened: $!";

        while (my $line = <$in>) {
            chomp $line;
            next unless $line =~ /^(\d+):(.*)$/;
            my ($index, $phrase) = ($1, $2);

            # compare the whole stored phrase, so that "art" is not taken
            # for the entry of "art gallery"
            $table{$phrase} = $index if exists $table{$phrase};

            $next_index = $index + 1 if $index >= $next_index;
        }

        close($in);
    }

    # add new phrases to the phrase index, in the order they were given
    open(my $out, '>>', $index_file)
        or die "$index_file could not be opened for appending: $!";

    foreach my $phrase (@phrases) {
        next unless $table{$phrase} eq "0";
        print {$out} "$next_index:$phrase\n";
        $table{$phrase} = "$next_index";
        $next_index++;
    }

    close($out) or die "$index_file could not be written: $!";

    return %table;
}
247
# keyphrase_document($ID, $stems)
#
# $ID is the hash ID of a document and $stems the list of kea phrase stems,
# separated by ", ", that exist for it. The keyphrase_document index is
# rewritten so that every one of those phrases lists the document.
# An entry is first written as
# -phrase index number:1:ID
# and each further document adds one to the count and appends |ID.
#
# A document that an entry already lists is not added to it again, and no
# second entry is written for a phrase that already has one. Lines that do
# not have the entry form are copied through unchanged.
#
# Dies if the index cannot be opened for writing.
sub keyphrase_document {

    my ($ID, $stems) = @_;
    my $index_file = "$gsdlhome/bin/script/indexes/keyphrase_document.txt";

    print STDERR "writing to keyphrase_document.txt...\n";

    # get table of phrases and phrase indexes
    my %table = keyphrase_index_search($stems);

    # phrase index numbers that still need an entry for this document
    my %pending = map { $_ => 1 } grep { $_ ne "0" } values %table;

    # read in the existing index, if there is one
    my @textlist;
    if (open(my $in, '<', $index_file)) {
        chomp(@textlist = <$in>);
        close($in);
    }

    # open index for output
    open(my $out, '>', $index_file)
        or die "$index_file could not be opened for writing: $!";

    # amend existing entries
    foreach my $line (@textlist) {
        if ($line =~ /^(\d+):(\d+)(.*)$/ && $pending{$1}) {
            my ($index, $num_docs, $ids) = ($1, $2, $3);

            # the phrase has an entry, so none must be added for it below,
            # whether or not this document is already listed in it
            delete $pending{$index};

            # $ids is ":ID|ID|..."; compare whole IDs, not substrings
            (my $id_list = $ids) =~ s/^://;
            unless (grep { $_ eq $ID } split(/\|/, $id_list)) {
                $num_docs++;    # increment number of docs
                $line = "$index:$num_docs$ids|$ID";
            }
        }
        print {$out} "$line\n";
    }

    # add new phrases to the index as 'phrase index:1:file ID'
    foreach my $index (sort { $a <=> $b } keys %pending) {
        print {$out} "$index:1:$ID\n";
    }

    close($out) or die "$index_file could not be written: $!";
}
318
# document_keyphrase($ID, $stemsS, $stem_c)
#
# $ID      - hash ID of a document
# $stemsS  - the kea phrase stems that exist for it, separated by ", "
# $stem_c  - the number of times each of those stem phrases appears in the
#            document, in the same order, separated by ", "
#
# The document_keyphrase index is rewritten so that it holds this line for
# the document:
# -file ID:number of phrases|phrase index,count|phrase index,count...
# The pairs are written in order of phrase index. A line that already
# exists for the ID is replaced in case someone has modified the file;
# otherwise the line is added to the end of the file. Existing lines that
# do not have the form 'ID:...' are not copied to the new file.
#
# Dies if the index cannot be opened for writing.
sub document_keyphrase {

    my ($ID, $stemsS, $stem_c) = @_;
    my $index_file = "$gsdlhome/bin/script/indexes/document_keyphrase.txt";

    print STDERR "writing to document_keyphrase.txt...\n";

    # split phrase counts into an array
    my @stem_counts = split(", ", $stem_c);

    # get table of phrases and phrase indexes
    my %table = keyphrase_index_search($stemsS);

    # split phrases into an array
    my @stems = split(", ", $stemsS);

    # build dictionary of phrase index => count
    my %phrases;
    foreach my $i (0 .. $#stems) {
        my $phrase = $table{ $stems[$i] };
        $phrases{"$phrase"} = "$stem_counts[$i]";
    }
    my $phrasenum = keys %phrases;    # number of phrases in doc

    # 'document ID:num of phrases|phrase index,number of times phrase appears'
    my $newline = "$ID:$phrasenum";
    foreach my $phrase (sort { $a <=> $b } keys %phrases) {
        $newline .= "|$phrase,$phrases{$phrase}";
    }

    # read in the existing index, if there is one
    my @textlist;
    if (open(my $in, '<', $index_file)) {
        chomp(@textlist = <$in>);
        close($in);
    }

    # open index for output
    open(my $out, '>', $index_file)
        or die "$index_file could not be opened for writing: $!";

    my $found = 0;

    foreach my $line (@textlist) {
        next unless $line =~ /([^:]+):(.*)/;    # all lines should follow this pattern
        if ($1 eq $ID) {
            # id is already in the file: write the new line in its place,
            # once only even if the old file held the ID several times
            print {$out} "$newline\n" unless $found;
            $found = 1;
        }
        else {
            print {$out} "$line\n";    # print old line out
        }
    }

    # a new index, or an ID that was not in it: append the line
    print {$out} "$newline\n" unless $found;

    close($out) or die "$index_file could not be written: $!";
}
411
412
413
414
415
416
417
418
419
420

Note: See TracBrowser for help on using the repository browser.

Download in other formats: