Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/bin/script/indexes/relation.pl@ 1971

Last change on this file since 1971 was 1971, checked in by jmt14, 23 years ago

added files: Core.pm PDF.pm Parse.pm amend_pdf.pl

buildkpi.pl buildkpiS.pl buildkpiK.pl relation.pl

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 12.9 KB

Line
1	#!/usr/bin/perl
2
3	#data structures used:
4	#-document_keyphrase
5	#%document_phrases = ("hashID" => {"phrase" => "number of occurrences"})
6	#-keyphrase_document
7	#%phrases_document = ("phrase" => "no. of docs phrase occurs in")
8
9	#after calculation relational data, it will amend gml files
10	#to include this data while writing to a file ID_relatedlinks.txt
11	#the html links of the related files
12	#if option p is set then the user wishes to add the related documents
13	#as bookmarks to the pdf file
14
15	require "getopts.pl";
16	&Getopts('N:P'); #process option arguments
17
18
19	$gsdlhome = $ENV{'GSDLHOME'};
20	@directories; #list of directories for the collections
21	%phrases_document; #hash storing phrase index => no. of documents with phrase in it
22	%document_phrases; #hash of hashes ID => {phrase number => no. of phrases in ID}
23	$N = 0; #number of documents in the index (presumably in the collection)
24	%cosine_matrix; #2-dimensional matrix, for each pair of documents stores a cosine measure
25	$number_of_files = 1; #by default the top related file is returned
26	$do_pdfs = 0;
27
28	if($opt_N =~ /(\d)+/){ #if flag is set then return top N docs
29	$number_of_files = $opt_N;
30	}
31
32	#get server name and httpprefix
33	$httpprefix;
34	$servername = "nzdl2.cs.waikato.ac.nz"; #should be localhost
35
36	if($opt_P == 1){ #if flag is set
37	$do_pdfs = 1;
38
39	#get the httpprefix from gsdlsite.cfg
40	open(CFG, "$gsdlhome/cgi-bin/gsdlsite.cfg")
41	or die "$gsdlhome/cgi-bin/gsdlsite.cfg could not be opened";
42
43	while(<CFG>){
44	chomp;
45	if(/httpprefix \/(.+)/){
46	$httpprefix = $1;
47	}
48	}
49	close CFG;
50	}
51
52	if(@ARGV){
53
54	@directories = @ARGV;
55
56	} else { #open collect directory and get a list of all collections
57	opendir(DIR, "$gsdlhome/collect");
58	@directories = grep(!/(^\.\|(CVS)\|(modelcol))/, readdir(DIR));
59	closedir(DIR);
60	}
61
62
63
64	#get the name of the directory the index will be stored in
65	$dirname = "";
66	foreach $collection (sort @directories){
67	$dirname .= $collection."_";
68	}
69	$dirname .= "indexes";
70	print STDERR "directory name: $dirname\n";
71
72	#open keyphrases_document.txt
73	open(KEY_FILE, "$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt")
74	or die "$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt could not be opened";
75
76	print STDERR "\nreading keyphrase_document.txt...\n";
77
78	#read a list of phrase indexes and number of documents that phrase appears in
79	while(<KEY_FILE>){
80	chomp;
81	if(/(\d+)\:(\d+)(.*)/){ #all lines should be of this form
82	my $phrase_index = $1;
83	my $doc_num = $2;
84	$phrases_document{"$phrase_index"} = "$doc_num"; #build keyphrase hash
85	}
86	}
87
88	close(KEY_FILE);
89
90	print STDERR "\nreading document_keyphrase.txt...\n";
91
92	#open document_keyphrases.txt
93	open(DOC_FILE, "$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt")
94	or die "$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt could not be opened";
95
96	#read a list of documents and the phrases that appear in them (& how many times they appear)
97	while(<DOC_FILE>){
98	chomp;
99	my %document;
100	if(/(\w+)\:(\d+)((\\|\d+,\d+)+)/){ #every line in the index of this form
101	my $ID = $1;
102	my $pairs = $3;
103	while ($pairs =~ s/\\|(\d+),(\d+)//) { #this is the list of keyphrases and numbers
104	my $phrase_index = $1;
105	my $num_phrase = $2;
106	$document{"$phrase_index"} = "$num_phrase"; #table to be stored in another table
107	}
108	$document_phrases{"$ID"} = ({%document});
109	$N++; #number of documents in the collection (which have kea metadata)
110	}
111	}
112
113	close(DOC_FILE);
114
115	#we do not want to recalculate measures ie if we have already calculated
116	#doc1 & doc2 we do not then want to calculate doc2 & doc1
117	#to achieve this we must first sort the list of ids so that both lists
118	#of documents will be in the same order (otherwise hash tables return values in
119	#no particular order). We then compare each document in the left
120	#column below against the documents in the right column:
121
122	#doc0 ->
123	#doc1 -> doc0
124	#doc2 -> doc0, doc 1
125	#doc3 -> doc0, doc 1, doc 2
126	#doc4 -> doc0, doc 1, doc 2, doc 3
127	#doc5 -> doc0, doc 1, doc 2, doc 3, doc 4
128	#etc
129
130	#this way we are not recalculating the same value and also not
131	#comparing each document to itself (which would result in the
132	#value 1) thus more than halving the processing time.
133
134	print STDERR "\ncalculating cosine measures for $N documents...\n";
135
136	$count = 0;
137	@ids = sort keys(%document_phrases);
138
139	#repeat for all pairs of documents
140	foreach $id1 (sort keys(%document_phrases)) {
141	my %table;
142
143	for($i=0; $i<$count; $i++){ #calculate the cosine measure and store
144	my $cosine_measure = &compare_docs($id1, $ids[$i]);
145	$table{"$ids[$i]"} = "$cosine_measure";
146	}
147	$cosine_matrix{"$id1"} = ({%table});
148	$count++; #each iteration the number of documents to compare against expands by 1
149	}
150
151	&addtogml(); #add the relational data we have just gathered to the documents gml files
152
153
154	#this function takes as an argument two document IDs and uses these to extract the
155	#keyphrases from the document_keyphrase data structure. It then calculates a
156	#'measure of relativity' [0-1] with which we can establish how similar the two
157	#documents are to each other. (0-not related, 1-same document). The equation
158	#used to establish this similarity:
159	#
160	# for all phrases
161	# in both d1, d2 (fd1p * loge(N/fp) (fd2p * loge(N/fp)
162	# cosine(d1, d2) = --------------------------------------------------------------
163	# sqrt(sum of phrases in d1(fd1p * loge(N/fp))) *
164	# sqrt(sum of phrases in d2(fd2p * loge(N/fp)))
165	#
166	#where d1 and d2 are lists of keyphrase and represent documents
167	#fd1p is the frequency that phrase p occurs in document d1
168	#fd2p is the frequency that phrase p occurs in document d2
169	#fp is the number of documents that have p as a keyphrase
170	#N is the number of documents in the collection
171	sub compare_docs {
172
173
174	my $ID1 = shift(@_); #ID for document one
175	my $ID2 = shift(@_); #ID for document two
176
177	my @phrases; #a list of phrases in document 1 and document 2
178	my @phrases1; #a list of phrases in document 1
179	my @phrases2; #a list of phrases in document 2
180
181
182	foreach $phrase (keys %{ $document_phrases{$ID1}}) { #list of phrases in doc1
183	push(@phrases1, $phrase);
184	}
185	foreach $phrase (keys %{ $document_phrases{$ID2}}) { #list of phrases in doc2
186	push(@phrases2, $phrase);
187	}
188	foreach $phrase1 (@phrases1) { #list holds intersection of doc1 and doc2
189	foreach $phrase2 (@phrases2) {
190	push(@phrases, $phrase1) if ($phrase1 == $phrase2);
191	}
192	}
193
194	#COSINE MEASURE
195	my $wqtwdt= 0;
196
197	foreach $phrase (@phrases){ #for all phrases ocurring in d1 and d2
198	#the frequency that phrase occurs in document d1
199	$fd1p = $document_phrases{$ID1}{$phrase};
200
201	#log base e(N/the number of documents that have phrase as a keyphrase)
202	$log_freq = log($N/$phrases_document{$phrase});
203
204	#the frequency that phrase occurs in document d2
205	$fd2p = $document_phrases{$ID2}{$phrase};
206	$sum = ($fd1p * $log_freq) * ($fd2p * $log_freq);
207	$wqtwdt += $sum;
208	}
209
210
211	my $wd = 0; #stores the calculation for wd
212
213	foreach $phrase (@phrases1){ #for all phrases ocurring in d1 and d2
214	#the frequency that phrase occurs in document d1
215	$fd1p = $document_phrases{$ID1}{$phrase};
216
217	#log base e($N/the number of documents that have phrase as a keyphrase)
218	$log_freq = log($N/$phrases_document{$phrase});
219	$sum = $fd1p * $log_freq;
220	$wd += ($sum * $sum); #sum squared
221	}
222
223
224	my $wq = 0; #stores the calculation for wq
225
226	foreach $phrase (@phrases2){ #for all phrases ocurring in d2
227	#the frequency that phrase occurs in document d2
228	$fd2p = $document_phrases{$ID2}{$phrase};
229
230	#log base e($N/the number of documents that have phrase as a keyphrase)
231	$log_freq = log($N/$phrases_document{$phrase});
232	$sum = $fd2p * $log_freq;
233	$wq += ($sum * $sum); #sum squared
234	}
235
236	my $wdwq = sqrt($wq) * sqrt($wd);
237	my $cosine = $wqtwdt / $wdwq;
238
239	return $cosine;
240
241	}
242
243	#this function adds the relational data we have collected to the gml files
244	sub addtogml {
245
246	print STDERR "\nadding relational data to $N gml documents...\n\n";
247	my @doclist;
248	my %filetable;
249	my $pattern = "kea="; #pattern to search for to find where to insert relation data
250
251	#open archive info for each collecticollection
252	foreach $collection (@directories){
253
254	open(INFO, "$gsdlhome/collect/$collection/archives/archives.inf")
255	or die "$gsdlhome/collect/$collection/archives/archives.inf could not be opened";
256
257	#read a list of ID's and pathnames into a file table
258	while(<INFO>){
259	chomp;
260	my %idtable;
261	my $ID;
262	if(/(\w+)(\s+)([\w\.\/]+)/){ #format of the line
263	$ID = $1;
264	$path = $3;
265	$idtable{"$path"} = "$collection";
266	}
267	$filetable{"$ID"} = ({%idtable});
268	}
269
270	close(INFO);
271	}
272
273
274
275	#for each id in the matrix calculate a list of related documents
276	foreach $id (keys %cosine_matrix){
277
278	@doclist = &calculate_list($id); #gets list of documents with most relevant scores
279
280	my @path = keys %{$filetable{$id}}; #get filepath for document
281	my $collection = $filetable{$id}{$path[0]};
282
283	#open gml file to amend for each collection
284	open(FILE, "$gsdlhome/collect/$collection/archives/$path[0]")
285	or die "$gsdlhome/collect/$collection/archives/$path[0] could not be opened";
286
287	#read the gml file into text
288	$text = "";
289	while(<FILE>){
290	$text .= $_;
291	}
292
293	close(FILE);
294
295	#delete previous relational data and urllinks to files storing rel doc links
296	$text =~ s/(\s)+relation=\"([^\"]*)\"//g;
297	$text =~ s/(\s)+urllink=\"([^\"]*)\"//g;
298	$text =~ s/(\s)+\/urllink=\"([^\"]*)\"//g;
299
300	#if we want to insert each relation item as 'relation1="id" relation2="id"'
301	#$count = 1; #insert each item in the list
302	#foreach $item (@doclist){ #insert list into text
303	# $text =~ s/(\s)($pattern)/ relation$count=\"$item\" $2/g;
304	# $count++;
305	#}
306
307	#or relation="id, id, id"
308	#my $relation = join(",", @doclist);
309
310	#if pdf flag is set then delete pdf url document
311	if($do_pdfs == 1){
312	my $dirpath = $path[0];
313	$dirpath =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
314	`rm $gsdlhome/collect/$collection/index/assoc/$dirpath/url.txt`;
315	}
316
317	my $title = "related document";
318	$title = $2 if($text =~ /(\s)+Title=\"([^\"]*)\"/g);
319
320	#relation="collection,id collection,id"
321	my @relationlist;
322	my $relation = "";
323	foreach $doc (@doclist){
324	my @p = keys %{$filetable{$doc}};
325	my $collect = $filetable{$doc}{$p[0]};
326	push(@relationlist, "$collect,$doc");
327
328	#write pdf docs to file
329	if($do_pdfs == 1){
330	my @rel_path = keys %{$filetable{$doc}};
331	&write_related_urls($title, $collection, $path[0], $collect, $rel_path[0]);
332	}
333	}
334
335	#modify the text to include relation data
336	$relation = join(" ", @relationlist);
337	$text =~ s/($pattern)/ relation=\"$relation\" $1/g;
338
339	#open gml file to write back new text
340	open(FILE, ">$gsdlhome/collect/$collection/archives/$path[0]") or print STDERR "NO!!!!\n";
341	print FILE "$text"; #write back text
342	close(FILE);
343
344	#amend the pdf file
345	if($do_pdfs == 1){
346	my $pdf_path = $path[0];
347	$pdf_path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
348	`perl amend_pdf.pl $gsdlhome/collect/$collection/index/assoc/$pdf_path/doc.pdf $gsdlhome/collect/$collection/index/assoc/$pdf_path/url.txt\n`;
349	}
350	}
351	}
352
353	#this function builds a list of cosine measures for each
354	#document and sorts the list of ids each measure belongs to
355	#in reverse order (ie docs with greatest cosine measure first)
356	#returns the list calculated
357	sub calculate_list {
358
359	my $document = shift(@_);
360	my %measures;
361	my @doclist;
362
363
364	#find the top $number_of_files for $document
365	foreach $id (keys %{$cosine_matrix{$document}}){
366	$measures{"$cosine_matrix{$document}{$id}"} = "$id";
367	}
368
369	my @list = reverse sort {$a<=>$b} keys %measures;
370
371	#list is as big as specified in command line (default is 1)
372	for($i = 0; $i<$number_of_files; $i++){
373	my $id = $measures{$list[$i]};
374	push(@doclist, $id) if($id ne "");
375	}
376
377	return @doclist;
378	}
379
380
381	sub write_related_urls {
382
383
384	#need server name eg nzdl2.cs.waikato.ac.nz
385	#localhost should work but doesn't on this computer?
386	#open up config file to read httpprefix
387	#get collection name of related document
388	#get hash directory of related document
389	#write http://nzdl2.cs.waikato.ac.nz/httpprefix/collectionname/index/assoc/directory/doc.pdf
390
391	my ($title, $collection, $path, $collect, $rel_path) = @_;
392
393	$rel_path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
394	$path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
395
396	print STDERR "writing related pdf urls to file ";
397	print STDERR "$gsdlhome/collect/$collection/index/assoc/$path/url.txt...\n";
398
399	open(URL, ">>$gsdlhome/collect/$collection/index/assoc/$path/url.txt")
400	or open(URL, ">$gsdlhome/collect/$collection/index/assoc/$path/url.txt")
401	or print STDERR "This file $title is not a pdf file\n";
402
403	print URL "$title\t";
404	print URL "http://$servername/$httpprefix/collect/$collect/index/assoc/$rel_path/doc.pdf\n";
405	close URL;
406
407	}
408
409
410
411
412
413
414

Note: See TracBrowser for help on using the repository browser.

Download in other formats: