Context Navigation

relation.pl@ 24874

Last change on this file since 24874 was 24874, checked in by ak19, 12 years ago
Third set of commits to do with the migration of cgi-bin into common-src, so that upon make install, common-src\cgi-bin will be installed in cgi-bin\GSDLOS(GSDLARCH). The first commit was of changes to files in cgi-bin itself. The second commit was moving cgi-bin. This one involves changes to all the files referring to cgi-bin where this needs to be changed to cgi-bin\OS_and_ARCH.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 13.0 KB

Line
1	#!/usr/bin/perl
2
3	#data structures used:
4	#-document_keyphrase
5	#%document_phrases = ("hashID" => {"phrase" => "number of occurrences"})
6	#-keyphrase_document
7	#%phrases_document = ("phrase" => "no. of docs phrase occurs in")
8
9	#after calculation relational data, it will amend gml files
10	#to include this data while writing to a file ID_relatedlinks.txt
11	#the html links of the related files
12	#if option p is set then the user wishes to add the related documents
13	#as bookmarks to the pdf file
14
15	require "getopts.pl";
16	&Getopts('N:P'); #process option arguments
17
18
19	$gsdlhome = $ENV{'GSDLHOME'};
20	@directories; #list of directories for the collections
21	%phrases_document; #hash storing phrase index => no. of documents with phrase in it
22	%document_phrases; #hash of hashes ID => {phrase number => no. of phrases in ID}
23	$N = 0; #number of documents in the index (presumably in the collection)
24	%cosine_matrix; #2-dimensional matrix, for each pair of documents stores a cosine measure
25	$number_of_files = 1; #by default the top related file is returned
26	$do_pdfs = 0;
27
28	if($opt_N =~ /(\d)+/){ #if flag is set then return top N docs
29	$number_of_files = $opt_N;
30	}
31
32	#get server name and httpprefix
33	$httpprefix;
34	$servername = "nzdl2.cs.waikato.ac.nz"; #should be localhost
35
36	if($opt_P == 1){ #if flag is set
37	$do_pdfs = 1;
38
39	#get the httpprefix from gsdlsite.cfg
40	my $cgibin = "cgi-bin/$ENV{'GSDLOS'}";
41	$cgibin = $cgibin.$ENV{'GSDLARCH'} if defined $ENV{'GSDLARCH';}
42	open(CFG, "$gsdlhome/$cgibin/gsdlsite.cfg")
43	or die "$gsdlhome/$cgibin/gsdlsite.cfg could not be opened";
44
45	while(<CFG>){
46	chomp;
47	if(/httpprefix \/(.+)/){
48	$httpprefix = $1;
49	}
50	}
51	close CFG;
52	}
53
54	if(@ARGV){
55
56	@directories = @ARGV;
57
58	} else { #open collect directory and get a list of all collections
59	opendir(DIR, "$gsdlhome/collect");
60	@directories = grep(!/(^\.\|(CVS)\|(modelcol))/, readdir(DIR));
61	closedir(DIR);
62	}
63
64
65
66	#get the name of the directory the index will be stored in
67	$dirname = "";
68	foreach $collection (sort @directories){
69	$dirname .= $collection."_";
70	}
71	$dirname .= "indexes";
72	print STDERR "directory name: $dirname\n";
73
74	#open keyphrases_document.txt
75	open(KEY_FILE, "$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt")
76	or die "$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt could not be opened";
77
78	print STDERR "\nreading keyphrase_document.txt...\n";
79
80	#read a list of phrase indexes and number of documents that phrase appears in
81	while(<KEY_FILE>){
82	chomp;
83	if(/(\d+)\:(\d+)(.*)/){ #all lines should be of this form
84	my $phrase_index = $1;
85	my $doc_num = $2;
86	$phrases_document{"$phrase_index"} = "$doc_num"; #build keyphrase hash
87	}
88	}
89
90	close(KEY_FILE);
91
92	print STDERR "\nreading document_keyphrase.txt...\n";
93
94	#open document_keyphrases.txt
95	open(DOC_FILE, "$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt")
96	or die "$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt could not be opened";
97
98	#read a list of documents and the phrases that appear in them (& how many times they appear)
99	while(<DOC_FILE>){
100	chomp;
101	my %document;
102	if(/(\w+)\:(\d+)((\\|\d+,\d+)+)/){ #every line in the index of this form
103	my $ID = $1;
104	my $pairs = $3;
105	while ($pairs =~ s/\\|(\d+),(\d+)//) { #this is the list of keyphrases and numbers
106	my $phrase_index = $1;
107	my $num_phrase = $2;
108	$document{"$phrase_index"} = "$num_phrase"; #table to be stored in another table
109	}
110	$document_phrases{"$ID"} = ({%document});
111	$N++; #number of documents in the collection (which have kea metadata)
112	}
113	}
114
115	close(DOC_FILE);
116
117	#we do not want to recalculate measures ie if we have already calculated
118	#doc1 & doc2 we do not then want to calculate doc2 & doc1
119	#to achieve this we must first sort the list of ids so that both lists
120	#of documents will be in the same order (otherwise hash tables return values in
121	#no particular order). We then compare each document in the left
122	#column below against the documents in the right column:
123
124	#doc0 ->
125	#doc1 -> doc0
126	#doc2 -> doc0, doc 1
127	#doc3 -> doc0, doc 1, doc 2
128	#doc4 -> doc0, doc 1, doc 2, doc 3
129	#doc5 -> doc0, doc 1, doc 2, doc 3, doc 4
130	#etc
131
132	#this way we are not recalculating the same value and also not
133	#comparing each document to itself (which would result in the
134	#value 1) thus more than halving the processing time.
135
136	print STDERR "\ncalculating cosine measures for $N documents...\n";
137
138	$count = 0;
139	@ids = sort keys(%document_phrases);
140
141	#repeat for all pairs of documents
142	foreach $id1 (sort keys(%document_phrases)) {
143	my %table;
144
145	for($i=0; $i<$count; $i++){ #calculate the cosine measure and store
146	my $cosine_measure = &compare_docs($id1, $ids[$i]);
147	$table{"$ids[$i]"} = "$cosine_measure";
148	}
149	$cosine_matrix{"$id1"} = ({%table});
150	$count++; #each iteration the number of documents to compare against expands by 1
151	}
152
153	&addtogml(); #add the relational data we have just gathered to the documents gml files
154
155
156	#this function takes as an argument two document IDs and uses these to extract the
157	#keyphrases from the document_keyphrase data structure. It then calculates a
158	#'measure of relativity' [0-1] with which we can establish how similar the two
159	#documents are to each other. (0-not related, 1-same document). The equation
160	#used to establish this similarity:
161	#
162	# for all phrases
163	# in both d1, d2 (fd1p * loge(N/fp) (fd2p * loge(N/fp)
164	# cosine(d1, d2) = --------------------------------------------------------------
165	# sqrt(sum of phrases in d1(fd1p * loge(N/fp))) *
166	# sqrt(sum of phrases in d2(fd2p * loge(N/fp)))
167	#
168	#where d1 and d2 are lists of keyphrase and represent documents
169	#fd1p is the frequency that phrase p occurs in document d1
170	#fd2p is the frequency that phrase p occurs in document d2
171	#fp is the number of documents that have p as a keyphrase
172	#N is the number of documents in the collection
173	sub compare_docs {
174
175
176	my $ID1 = shift(@_); #ID for document one
177	my $ID2 = shift(@_); #ID for document two
178
179	my @phrases; #a list of phrases in document 1 and document 2
180	my @phrases1; #a list of phrases in document 1
181	my @phrases2; #a list of phrases in document 2
182
183
184	foreach $phrase (keys %{ $document_phrases{$ID1}}) { #list of phrases in doc1
185	push(@phrases1, $phrase);
186	}
187	foreach $phrase (keys %{ $document_phrases{$ID2}}) { #list of phrases in doc2
188	push(@phrases2, $phrase);
189	}
190	foreach $phrase1 (@phrases1) { #list holds intersection of doc1 and doc2
191	foreach $phrase2 (@phrases2) {
192	push(@phrases, $phrase1) if ($phrase1 == $phrase2);
193	}
194	}
195
196	#COSINE MEASURE
197	my $wqtwdt= 0;
198
199	foreach $phrase (@phrases){ #for all phrases ocurring in d1 and d2
200	#the frequency that phrase occurs in document d1
201	$fd1p = $document_phrases{$ID1}{$phrase};
202
203	#log base e(N/the number of documents that have phrase as a keyphrase)
204	$log_freq = log($N/$phrases_document{$phrase});
205
206	#the frequency that phrase occurs in document d2
207	$fd2p = $document_phrases{$ID2}{$phrase};
208	$sum = ($fd1p * $log_freq) * ($fd2p * $log_freq);
209	$wqtwdt += $sum;
210	}
211
212
213	my $wd = 0; #stores the calculation for wd
214
215	foreach $phrase (@phrases1){ #for all phrases ocurring in d1 and d2
216	#the frequency that phrase occurs in document d1
217	$fd1p = $document_phrases{$ID1}{$phrase};
218
219	#log base e($N/the number of documents that have phrase as a keyphrase)
220	$log_freq = log($N/$phrases_document{$phrase});
221	$sum = $fd1p * $log_freq;
222	$wd += ($sum * $sum); #sum squared
223	}
224
225
226	my $wq = 0; #stores the calculation for wq
227
228	foreach $phrase (@phrases2){ #for all phrases ocurring in d2
229	#the frequency that phrase occurs in document d2
230	$fd2p = $document_phrases{$ID2}{$phrase};
231
232	#log base e($N/the number of documents that have phrase as a keyphrase)
233	$log_freq = log($N/$phrases_document{$phrase});
234	$sum = $fd2p * $log_freq;
235	$wq += ($sum * $sum); #sum squared
236	}
237
238	my $wdwq = sqrt($wq) * sqrt($wd);
239	my $cosine = $wqtwdt / $wdwq;
240
241	return $cosine;
242
243	}
244
245	#this function adds the relational data we have collected to the gml files
246	sub addtogml {
247
248	print STDERR "\nadding relational data to $N gml documents...\n\n";
249	my @doclist;
250	my %filetable;
251	my $pattern = "kea="; #pattern to search for to find where to insert relation data
252
253	#open archive info for each collecticollection
254	foreach $collection (@directories){
255
256	open(INFO, "$gsdlhome/collect/$collection/archives/archives.inf")
257	or die "$gsdlhome/collect/$collection/archives/archives.inf could not be opened";
258
259	#read a list of ID's and pathnames into a file table
260	while(<INFO>){
261	chomp;
262	my %idtable;
263	my $ID;
264	if(/(\w+)(\s+)([\w\.\/]+)/){ #format of the line
265	$ID = $1;
266	$path = $3;
267	$idtable{"$path"} = "$collection";
268	}
269	$filetable{"$ID"} = ({%idtable});
270	}
271
272	close(INFO);
273	}
274
275
276
277	#for each id in the matrix calculate a list of related documents
278	foreach $id (keys %cosine_matrix){
279
280	@doclist = &calculate_list($id); #gets list of documents with most relevant scores
281
282	my @path = keys %{$filetable{$id}}; #get filepath for document
283	my $collection = $filetable{$id}{$path[0]};
284
285	#open gml file to amend for each collection
286	open(FILE, "$gsdlhome/collect/$collection/archives/$path[0]")
287	or die "$gsdlhome/collect/$collection/archives/$path[0] could not be opened";
288
289	#read the gml file into text
290	$text = "";
291	while(<FILE>){
292	$text .= $_;
293	}
294
295	close(FILE);
296
297	#delete previous relational data and urllinks to files storing rel doc links
298	$text =~ s/(\s)+relation=\"([^\"]*)\"//g;
299	$text =~ s/(\s)+urllink=\"([^\"]*)\"//g;
300	$text =~ s/(\s)+\/urllink=\"([^\"]*)\"//g;
301
302	#if we want to insert each relation item as 'relation1="id" relation2="id"'
303	#$count = 1; #insert each item in the list
304	#foreach $item (@doclist){ #insert list into text
305	# $text =~ s/(\s)($pattern)/ relation$count=\"$item\" $2/g;
306	# $count++;
307	#}
308
309	#or relation="id, id, id"
310	#my $relation = join(",", @doclist);
311
312	#if pdf flag is set then delete pdf url document
313	if($do_pdfs == 1){
314	my $dirpath = $path[0];
315	$dirpath =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
316	`rm $gsdlhome/collect/$collection/index/assoc/$dirpath/url.txt`;
317	}
318
319	my $title = "related document";
320	$title = $2 if($text =~ /(\s)+Title=\"([^\"]*)\"/g);
321
322	#relation="collection,id collection,id"
323	my @relationlist;
324	my $relation = "";
325	foreach $doc (@doclist){
326	my @p = keys %{$filetable{$doc}};
327	my $collect = $filetable{$doc}{$p[0]};
328	push(@relationlist, "$collect,$doc");
329
330	#write pdf docs to file
331	if($do_pdfs == 1){
332	my @rel_path = keys %{$filetable{$doc}};
333	&write_related_urls($title, $collection, $path[0], $collect, $rel_path[0]);
334	}
335	}
336
337	#modify the text to include relation data
338	$relation = join(" ", @relationlist);
339	$text =~ s/($pattern)/ relation=\"$relation\" $1/g;
340
341	#open gml file to write back new text
342	open(FILE, ">$gsdlhome/collect/$collection/archives/$path[0]") or print STDERR "NO!!!!\n";
343	print FILE "$text"; #write back text
344	close(FILE);
345
346	#amend the pdf file
347	if($do_pdfs == 1){
348	my $pdf_path = $path[0];
349	$pdf_path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
350	`perl amend_pdf.pl $gsdlhome/collect/$collection/index/assoc/$pdf_path/doc.pdf $gsdlhome/collect/$collection/index/assoc/$pdf_path/url.txt\n`;
351	}
352	}
353	}
354
355	#this function builds a list of cosine measures for each
356	#document and sorts the list of ids each measure belongs to
357	#in reverse order (ie docs with greatest cosine measure first)
358	#returns the list calculated
359	sub calculate_list {
360
361	my $document = shift(@_);
362	my %measures;
363	my @doclist;
364
365
366	#find the top $number_of_files for $document
367	foreach $id (keys %{$cosine_matrix{$document}}){
368	$measures{"$cosine_matrix{$document}{$id}"} = "$id";
369	}
370
371	my @list = reverse sort {$a<=>$b} keys %measures;
372
373	#list is as big as specified in command line (default is 1)
374	for($i = 0; $i<$number_of_files; $i++){
375	my $id = $measures{$list[$i]};
376	push(@doclist, $id) if($id ne "");
377	}
378
379	return @doclist;
380	}
381
382
383	sub write_related_urls {
384
385
386	#need server name eg nzdl2.cs.waikato.ac.nz
387	#localhost should work but doesn't on this computer?
388	#open up config file to read httpprefix
389	#get collection name of related document
390	#get hash directory of related document
391	#write http://nzdl2.cs.waikato.ac.nz/httpprefix/collectionname/index/assoc/directory/doc.pdf
392
393	my ($title, $collection, $path, $collect, $rel_path) = @_;
394
395	$rel_path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
396	$path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
397
398	print STDERR "writing related pdf urls to file ";
399	print STDERR "$gsdlhome/collect/$collection/index/assoc/$path/url.txt...\n";
400
401	open(URL, ">>$gsdlhome/collect/$collection/index/assoc/$path/url.txt")
402	or open(URL, ">$gsdlhome/collect/$collection/index/assoc/$path/url.txt")
403	or print STDERR "This file $title is not a pdf file\n";
404
405	print URL "$title\t";
406	print URL "http://$servername/$httpprefix/collect/$collect/index/assoc/$rel_path/doc.pdf\n";
407	close URL;
408
409	}
410
411
412
413
414
415
416

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/bin/script/indexes/relation.pl@ 24874

Download in other formats: