source: trunk/gsdl/bin/script/indexes/relation.pl@ 1971

Last change on this file since 1971 was 1971, checked in by jmt14, 23 years ago

added files: Core.pm PDF.pm Parse.pm amend_pdf.pl

buildkpi.pl buildkpiS.pl buildkpiK.pl relation.pl

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.9 KB
Line 
1#!/usr/bin/perl
2
3#data structures used:
4#-document_keyphrase
5#%document_phrases = ("hashID" => {"phrase" => "number of occurrences"})
6#-keyphrase_document
7#%phrases_document = ("phrase" => "no. of docs phrase occurs in")
8
9#after calculation relational data, it will amend gml files
10#to include this data while writing to a file ID_relatedlinks.txt
11#the html links of the related files
12#if option p is set then the user wishes to add the related documents
13#as bookmarks to the pdf file
14
15require "getopts.pl";
16&Getopts('N:P'); #process option arguments
17
18
19$gsdlhome = $ENV{'GSDLHOME'};
20@directories; #list of directories for the collections
21%phrases_document; #hash storing phrase index => no. of documents with phrase in it
22%document_phrases; #hash of hashes ID => {phrase number => no. of phrases in ID}
23$N = 0; #number of documents in the index (presumably in the collection)
24%cosine_matrix; #2-dimensional matrix, for each pair of documents stores a cosine measure
25$number_of_files = 1; #by default the top related file is returned
26$do_pdfs = 0;
27
28if($opt_N =~ /(\d)+/){ #if flag is set then return top N docs
29 $number_of_files = $opt_N;
30}
31
32#get server name and httpprefix
33$httpprefix;
34$servername = "nzdl2.cs.waikato.ac.nz"; #should be localhost
35
36if($opt_P == 1){ #if flag is set
37 $do_pdfs = 1;
38
39 #get the httpprefix from gsdlsite.cfg
40 open(CFG, "$gsdlhome/cgi-bin/gsdlsite.cfg")
41 or die "$gsdlhome/cgi-bin/gsdlsite.cfg could not be opened";
42
43 while(<CFG>){
44 chomp;
45 if(/httpprefix \/(.+)/){
46 $httpprefix = $1;
47 }
48 }
49 close CFG;
50}
51
52if(@ARGV){
53
54 @directories = @ARGV;
55
56} else { #open collect directory and get a list of all collections
57 opendir(DIR, "$gsdlhome/collect");
58 @directories = grep(!/(^\.|(CVS)|(modelcol))/, readdir(DIR));
59 closedir(DIR);
60}
61
62
63
64#get the name of the directory the index will be stored in
65$dirname = "";
66foreach $collection (sort @directories){
67 $dirname .= $collection."_";
68}
69$dirname .= "indexes";
70print STDERR "directory name: $dirname\n";
71
72#open keyphrases_document.txt
73open(KEY_FILE, "$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt")
74 or die "$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt could not be opened";
75
76print STDERR "\nreading keyphrase_document.txt...\n";
77
78#read a list of phrase indexes and number of documents that phrase appears in
79while(<KEY_FILE>){
80 chomp;
81 if(/(\d+)\:(\d+)(.*)/){ #all lines should be of this form
82 my $phrase_index = $1;
83 my $doc_num = $2;
84 $phrases_document{"$phrase_index"} = "$doc_num"; #build keyphrase hash
85 }
86}
87
88close(KEY_FILE);
89
90print STDERR "\nreading document_keyphrase.txt...\n";
91
92#open document_keyphrases.txt
93open(DOC_FILE, "$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt")
94 or die "$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt could not be opened";
95
96#read a list of documents and the phrases that appear in them (& how many times they appear)
97while(<DOC_FILE>){
98 chomp;
99 my %document;
100 if(/(\w+)\:(\d+)((\|\d+,\d+)+)/){ #every line in the index of this form
101 my $ID = $1;
102 my $pairs = $3;
103 while ($pairs =~ s/\|(\d+),(\d+)//) { #this is the list of keyphrases and numbers
104 my $phrase_index = $1;
105 my $num_phrase = $2;
106 $document{"$phrase_index"} = "$num_phrase"; #table to be stored in another table
107 }
108 $document_phrases{"$ID"} = ({%document});
109 $N++; #number of documents in the collection (which have kea metadata)
110 }
111}
112
113close(DOC_FILE);
114
115#we do not want to recalculate measures ie if we have already calculated
116#doc1 & doc2 we do not then want to calculate doc2 & doc1
117#to achieve this we must first sort the list of ids so that both lists
118#of documents will be in the same order (otherwise hash tables return values in
119#no particular order). We then compare each document in the left
120#column below against the documents in the right column:
121
122#doc0 ->
123#doc1 -> doc0
124#doc2 -> doc0, doc 1
125#doc3 -> doc0, doc 1, doc 2
126#doc4 -> doc0, doc 1, doc 2, doc 3
127#doc5 -> doc0, doc 1, doc 2, doc 3, doc 4
128#etc
129
130#this way we are not recalculating the same value and also not
131#comparing each document to itself (which would result in the
132#value 1) thus more than halving the processing time.
133
134print STDERR "\ncalculating cosine measures for $N documents...\n";
135
136$count = 0;
137@ids = sort keys(%document_phrases);
138
139#repeat for all pairs of documents
140foreach $id1 (sort keys(%document_phrases)) {
141 my %table;
142
143 for($i=0; $i<$count; $i++){ #calculate the cosine measure and store
144 my $cosine_measure = &compare_docs($id1, $ids[$i]);
145 $table{"$ids[$i]"} = "$cosine_measure";
146 }
147 $cosine_matrix{"$id1"} = ({%table});
148 $count++; #each iteration the number of documents to compare against expands by 1
149}
150
151&addtogml(); #add the relational data we have just gathered to the documents gml files
152
153
154#this function takes as an argument two document IDs and uses these to extract the
155#keyphrases from the document_keyphrase data structure. It then calculates a
156#'measure of relativity' [0-1] with which we can establish how similar the two
157#documents are to each other. (0-not related, 1-same document). The equation
158#used to establish this similarity:
159#
160# for all phrases
161# in both d1, d2 (fd1p * loge(N/fp) (fd2p * loge(N/fp)
162# cosine(d1, d2) = --------------------------------------------------------------
163# sqrt(sum of phrases in d1(fd1p * loge(N/fp))) *
164# sqrt(sum of phrases in d2(fd2p * loge(N/fp)))
165#
166#where d1 and d2 are lists of keyphrase and represent documents
167#fd1p is the frequency that phrase p occurs in document d1
168#fd2p is the frequency that phrase p occurs in document d2
169#fp is the number of documents that have p as a keyphrase
170#N is the number of documents in the collection
171sub compare_docs {
172
173
174 my $ID1 = shift(@_); #ID for document one
175 my $ID2 = shift(@_); #ID for document two
176
177 my @phrases; #a list of phrases in document 1 and document 2
178 my @phrases1; #a list of phrases in document 1
179 my @phrases2; #a list of phrases in document 2
180
181
182 foreach $phrase (keys %{ $document_phrases{$ID1}}) { #list of phrases in doc1
183 push(@phrases1, $phrase);
184 }
185 foreach $phrase (keys %{ $document_phrases{$ID2}}) { #list of phrases in doc2
186 push(@phrases2, $phrase);
187 }
188 foreach $phrase1 (@phrases1) { #list holds intersection of doc1 and doc2
189 foreach $phrase2 (@phrases2) {
190 push(@phrases, $phrase1) if ($phrase1 == $phrase2);
191 }
192 }
193
194#COSINE MEASURE
195 my $wqtwdt= 0;
196
197 foreach $phrase (@phrases){ #for all phrases ocurring in d1 and d2
198 #the frequency that phrase occurs in document d1
199 $fd1p = $document_phrases{$ID1}{$phrase};
200
201 #log base e(N/the number of documents that have phrase as a keyphrase)
202 $log_freq = log($N/$phrases_document{$phrase});
203
204 #the frequency that phrase occurs in document d2
205 $fd2p = $document_phrases{$ID2}{$phrase};
206 $sum = ($fd1p * $log_freq) * ($fd2p * $log_freq);
207 $wqtwdt += $sum;
208 }
209
210
211 my $wd = 0; #stores the calculation for wd
212
213 foreach $phrase (@phrases1){ #for all phrases ocurring in d1 and d2
214 #the frequency that phrase occurs in document d1
215 $fd1p = $document_phrases{$ID1}{$phrase};
216
217 #log base e($N/the number of documents that have phrase as a keyphrase)
218 $log_freq = log($N/$phrases_document{$phrase});
219 $sum = $fd1p * $log_freq;
220 $wd += ($sum * $sum); #sum squared
221 }
222
223
224 my $wq = 0; #stores the calculation for wq
225
226 foreach $phrase (@phrases2){ #for all phrases ocurring in d2
227 #the frequency that phrase occurs in document d2
228 $fd2p = $document_phrases{$ID2}{$phrase};
229
230 #log base e($N/the number of documents that have phrase as a keyphrase)
231 $log_freq = log($N/$phrases_document{$phrase});
232 $sum = $fd2p * $log_freq;
233 $wq += ($sum * $sum); #sum squared
234 }
235
236 my $wdwq = sqrt($wq) * sqrt($wd);
237 my $cosine = $wqtwdt / $wdwq;
238
239 return $cosine;
240
241}
242
243#this function adds the relational data we have collected to the gml files
244sub addtogml {
245
246 print STDERR "\nadding relational data to $N gml documents...\n\n";
247 my @doclist;
248 my %filetable;
249 my $pattern = "kea="; #pattern to search for to find where to insert relation data
250
251 #open archive info for each collecticollection
252 foreach $collection (@directories){
253
254 open(INFO, "$gsdlhome/collect/$collection/archives/archives.inf")
255 or die "$gsdlhome/collect/$collection/archives/archives.inf could not be opened";
256
257 #read a list of ID's and pathnames into a file table
258 while(<INFO>){
259 chomp;
260 my %idtable;
261 my $ID;
262 if(/(\w+)(\s+)([\w\.\/]+)/){ #format of the line
263 $ID = $1;
264 $path = $3;
265 $idtable{"$path"} = "$collection";
266 }
267 $filetable{"$ID"} = ({%idtable});
268 }
269
270 close(INFO);
271 }
272
273
274
275 #for each id in the matrix calculate a list of related documents
276 foreach $id (keys %cosine_matrix){
277
278 @doclist = &calculate_list($id); #gets list of documents with most relevant scores
279
280 my @path = keys %{$filetable{$id}}; #get filepath for document
281 my $collection = $filetable{$id}{$path[0]};
282
283 #open gml file to amend for each collection
284 open(FILE, "$gsdlhome/collect/$collection/archives/$path[0]")
285 or die "$gsdlhome/collect/$collection/archives/$path[0] could not be opened";
286
287 #read the gml file into text
288 $text = "";
289 while(<FILE>){
290 $text .= $_;
291 }
292
293 close(FILE);
294
295 #delete previous relational data and urllinks to files storing rel doc links
296 $text =~ s/(\s)+relation=\"([^\"]*)\"//g;
297 $text =~ s/(\s)+urllink=\"([^\"]*)\"//g;
298 $text =~ s/(\s)+\/urllink=\"([^\"]*)\"//g;
299
300 #if we want to insert each relation item as 'relation1="id" relation2="id"'
301 #$count = 1; #insert each item in the list
302 #foreach $item (@doclist){ #insert list into text
303 # $text =~ s/(\s)($pattern)/ relation$count=\"$item\" $2/g;
304 # $count++;
305 #}
306
307 #or relation="id, id, id"
308 #my $relation = join(",", @doclist);
309
310 #if pdf flag is set then delete pdf url document
311 if($do_pdfs == 1){
312 my $dirpath = $path[0];
313 $dirpath =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
314 `rm $gsdlhome/collect/$collection/index/assoc/$dirpath/url.txt`;
315 }
316
317 my $title = "related document";
318 $title = $2 if($text =~ /(\s)+Title=\"([^\"]*)\"/g);
319
320 #relation="collection,id collection,id"
321 my @relationlist;
322 my $relation = "";
323 foreach $doc (@doclist){
324 my @p = keys %{$filetable{$doc}};
325 my $collect = $filetable{$doc}{$p[0]};
326 push(@relationlist, "$collect,$doc");
327
328 #write pdf docs to file
329 if($do_pdfs == 1){
330 my @rel_path = keys %{$filetable{$doc}};
331 &write_related_urls($title, $collection, $path[0], $collect, $rel_path[0]);
332 }
333 }
334
335 #modify the text to include relation data
336 $relation = join(" ", @relationlist);
337 $text =~ s/($pattern)/ relation=\"$relation\" $1/g;
338
339 #open gml file to write back new text
340 open(FILE, ">$gsdlhome/collect/$collection/archives/$path[0]") or print STDERR "NO!!!!\n";
341 print FILE "$text"; #write back text
342 close(FILE);
343
344 #amend the pdf file
345 if($do_pdfs == 1){
346 my $pdf_path = $path[0];
347 $pdf_path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
348 `perl amend_pdf.pl $gsdlhome/collect/$collection/index/assoc/$pdf_path/doc.pdf $gsdlhome/collect/$collection/index/assoc/$pdf_path/url.txt\n`;
349 }
350 }
351}
352
353#this function builds a list of cosine measures for each
354#document and sorts the list of ids each measure belongs to
355#in reverse order (ie docs with greatest cosine measure first)
356#returns the list calculated
357sub calculate_list {
358
359 my $document = shift(@_);
360 my %measures;
361 my @doclist;
362
363
364 #find the top $number_of_files for $document
365 foreach $id (keys %{$cosine_matrix{$document}}){
366 $measures{"$cosine_matrix{$document}{$id}"} = "$id";
367 }
368
369 my @list = reverse sort {$a<=>$b} keys %measures;
370
371 #list is as big as specified in command line (default is 1)
372 for($i = 0; $i<$number_of_files; $i++){
373 my $id = $measures{$list[$i]};
374 push(@doclist, $id) if($id ne "");
375 }
376
377 return @doclist;
378}
379
380
381sub write_related_urls {
382
383
384 #need server name eg nzdl2.cs.waikato.ac.nz
385 #localhost should work but doesn't on this computer?
386 #open up config file to read httpprefix
387 #get collection name of related document
388 #get hash directory of related document
389 #write http://nzdl2.cs.waikato.ac.nz/httpprefix/collectionname/index/assoc/directory/doc.pdf
390
391 my ($title, $collection, $path, $collect, $rel_path) = @_;
392
393 $rel_path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
394 $path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
395
396 print STDERR "writing related pdf urls to file ";
397 print STDERR "$gsdlhome/collect/$collection/index/assoc/$path/url.txt...\n";
398
399 open(URL, ">>$gsdlhome/collect/$collection/index/assoc/$path/url.txt")
400 or open(URL, ">$gsdlhome/collect/$collection/index/assoc/$path/url.txt")
401 or print STDERR "This file $title is not a pdf file\n";
402
403 print URL "$title\t";
404 print URL "http://$servername/$httpprefix/collect/$collect/index/assoc/$rel_path/doc.pdf\n";
405 close URL;
406
407}
408
409
410
411
412
413
414
Note: See TracBrowser for help on using the repository browser.