source: main/trunk/greenstone2/bin/script/indexes/relation.pl@ 24874

Last change on this file since 24874 was 24874, checked in by ak19, 12 years ago

Third set of commits to do with the migration of cgi-bin into common-src, so that upon make install, common-src\cgi-bin will be installed in cgi-bin\GSDLOS(GSDLARCH). The first commit was of changes to files in cgi-bin itself. The second commit was moving cgi-bin. This one involves changes to all the files referring to cgi-bin where this needs to be changed to cgi-bin\OS_and_ARCH.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.0 KB
Line 
1#!/usr/bin/perl
2
3#data structures used:
4#-document_keyphrase
5#%document_phrases = ("hashID" => {"phrase" => "number of occurrences"})
6#-keyphrase_document
7#%phrases_document = ("phrase" => "no. of docs phrase occurs in")
8
9#after calculation relational data, it will amend gml files
10#to include this data while writing to a file ID_relatedlinks.txt
11#the html links of the related files
12#if option p is set then the user wishes to add the related documents
13#as bookmarks to the pdf file
14
15require "getopts.pl";
16&Getopts('N:P'); #process option arguments
17
18
19$gsdlhome = $ENV{'GSDLHOME'};
20@directories; #list of directories for the collections
21%phrases_document; #hash storing phrase index => no. of documents with phrase in it
22%document_phrases; #hash of hashes ID => {phrase number => no. of phrases in ID}
23$N = 0; #number of documents in the index (presumably in the collection)
24%cosine_matrix; #2-dimensional matrix, for each pair of documents stores a cosine measure
25$number_of_files = 1; #by default the top related file is returned
26$do_pdfs = 0;
27
28if($opt_N =~ /(\d)+/){ #if flag is set then return top N docs
29 $number_of_files = $opt_N;
30}
31
32#get server name and httpprefix
33$httpprefix;
34$servername = "nzdl2.cs.waikato.ac.nz"; #should be localhost
35
36if($opt_P == 1){ #if flag is set
37 $do_pdfs = 1;
38
39 #get the httpprefix from gsdlsite.cfg
40 my $cgibin = "cgi-bin/$ENV{'GSDLOS'}";
41 $cgibin = $cgibin.$ENV{'GSDLARCH'} if defined $ENV{'GSDLARCH';}
42 open(CFG, "$gsdlhome/$cgibin/gsdlsite.cfg")
43 or die "$gsdlhome/$cgibin/gsdlsite.cfg could not be opened";
44
45 while(<CFG>){
46 chomp;
47 if(/httpprefix \/(.+)/){
48 $httpprefix = $1;
49 }
50 }
51 close CFG;
52}
53
54if(@ARGV){
55
56 @directories = @ARGV;
57
58} else { #open collect directory and get a list of all collections
59 opendir(DIR, "$gsdlhome/collect");
60 @directories = grep(!/(^\.|(CVS)|(modelcol))/, readdir(DIR));
61 closedir(DIR);
62}
63
64
65
66#get the name of the directory the index will be stored in
67$dirname = "";
68foreach $collection (sort @directories){
69 $dirname .= $collection."_";
70}
71$dirname .= "indexes";
72print STDERR "directory name: $dirname\n";
73
74#open keyphrases_document.txt
75open(KEY_FILE, "$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt")
76 or die "$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt could not be opened";
77
78print STDERR "\nreading keyphrase_document.txt...\n";
79
80#read a list of phrase indexes and number of documents that phrase appears in
81while(<KEY_FILE>){
82 chomp;
83 if(/(\d+)\:(\d+)(.*)/){ #all lines should be of this form
84 my $phrase_index = $1;
85 my $doc_num = $2;
86 $phrases_document{"$phrase_index"} = "$doc_num"; #build keyphrase hash
87 }
88}
89
90close(KEY_FILE);
91
92print STDERR "\nreading document_keyphrase.txt...\n";
93
94#open document_keyphrases.txt
95open(DOC_FILE, "$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt")
96 or die "$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt could not be opened";
97
98#read a list of documents and the phrases that appear in them (& how many times they appear)
99while(<DOC_FILE>){
100 chomp;
101 my %document;
102 if(/(\w+)\:(\d+)((\|\d+,\d+)+)/){ #every line in the index of this form
103 my $ID = $1;
104 my $pairs = $3;
105 while ($pairs =~ s/\|(\d+),(\d+)//) { #this is the list of keyphrases and numbers
106 my $phrase_index = $1;
107 my $num_phrase = $2;
108 $document{"$phrase_index"} = "$num_phrase"; #table to be stored in another table
109 }
110 $document_phrases{"$ID"} = ({%document});
111 $N++; #number of documents in the collection (which have kea metadata)
112 }
113}
114
115close(DOC_FILE);
116
117#we do not want to recalculate measures ie if we have already calculated
118#doc1 & doc2 we do not then want to calculate doc2 & doc1
119#to achieve this we must first sort the list of ids so that both lists
120#of documents will be in the same order (otherwise hash tables return values in
121#no particular order). We then compare each document in the left
122#column below against the documents in the right column:
123
124#doc0 ->
125#doc1 -> doc0
126#doc2 -> doc0, doc 1
127#doc3 -> doc0, doc 1, doc 2
128#doc4 -> doc0, doc 1, doc 2, doc 3
129#doc5 -> doc0, doc 1, doc 2, doc 3, doc 4
130#etc
131
132#this way we are not recalculating the same value and also not
133#comparing each document to itself (which would result in the
134#value 1) thus more than halving the processing time.
135
136print STDERR "\ncalculating cosine measures for $N documents...\n";
137
138$count = 0;
139@ids = sort keys(%document_phrases);
140
141#repeat for all pairs of documents
142foreach $id1 (sort keys(%document_phrases)) {
143 my %table;
144
145 for($i=0; $i<$count; $i++){ #calculate the cosine measure and store
146 my $cosine_measure = &compare_docs($id1, $ids[$i]);
147 $table{"$ids[$i]"} = "$cosine_measure";
148 }
149 $cosine_matrix{"$id1"} = ({%table});
150 $count++; #each iteration the number of documents to compare against expands by 1
151}
152
153&addtogml(); #add the relational data we have just gathered to the documents gml files
154
155
156#this function takes as an argument two document IDs and uses these to extract the
157#keyphrases from the document_keyphrase data structure. It then calculates a
158#'measure of relativity' [0-1] with which we can establish how similar the two
159#documents are to each other. (0-not related, 1-same document). The equation
160#used to establish this similarity:
161#
162# for all phrases
163# in both d1, d2 (fd1p * loge(N/fp) (fd2p * loge(N/fp)
164# cosine(d1, d2) = --------------------------------------------------------------
165# sqrt(sum of phrases in d1(fd1p * loge(N/fp))) *
166# sqrt(sum of phrases in d2(fd2p * loge(N/fp)))
167#
168#where d1 and d2 are lists of keyphrase and represent documents
169#fd1p is the frequency that phrase p occurs in document d1
170#fd2p is the frequency that phrase p occurs in document d2
171#fp is the number of documents that have p as a keyphrase
172#N is the number of documents in the collection
173sub compare_docs {
174
175
176 my $ID1 = shift(@_); #ID for document one
177 my $ID2 = shift(@_); #ID for document two
178
179 my @phrases; #a list of phrases in document 1 and document 2
180 my @phrases1; #a list of phrases in document 1
181 my @phrases2; #a list of phrases in document 2
182
183
184 foreach $phrase (keys %{ $document_phrases{$ID1}}) { #list of phrases in doc1
185 push(@phrases1, $phrase);
186 }
187 foreach $phrase (keys %{ $document_phrases{$ID2}}) { #list of phrases in doc2
188 push(@phrases2, $phrase);
189 }
190 foreach $phrase1 (@phrases1) { #list holds intersection of doc1 and doc2
191 foreach $phrase2 (@phrases2) {
192 push(@phrases, $phrase1) if ($phrase1 == $phrase2);
193 }
194 }
195
196#COSINE MEASURE
197 my $wqtwdt= 0;
198
199 foreach $phrase (@phrases){ #for all phrases ocurring in d1 and d2
200 #the frequency that phrase occurs in document d1
201 $fd1p = $document_phrases{$ID1}{$phrase};
202
203 #log base e(N/the number of documents that have phrase as a keyphrase)
204 $log_freq = log($N/$phrases_document{$phrase});
205
206 #the frequency that phrase occurs in document d2
207 $fd2p = $document_phrases{$ID2}{$phrase};
208 $sum = ($fd1p * $log_freq) * ($fd2p * $log_freq);
209 $wqtwdt += $sum;
210 }
211
212
213 my $wd = 0; #stores the calculation for wd
214
215 foreach $phrase (@phrases1){ #for all phrases ocurring in d1 and d2
216 #the frequency that phrase occurs in document d1
217 $fd1p = $document_phrases{$ID1}{$phrase};
218
219 #log base e($N/the number of documents that have phrase as a keyphrase)
220 $log_freq = log($N/$phrases_document{$phrase});
221 $sum = $fd1p * $log_freq;
222 $wd += ($sum * $sum); #sum squared
223 }
224
225
226 my $wq = 0; #stores the calculation for wq
227
228 foreach $phrase (@phrases2){ #for all phrases ocurring in d2
229 #the frequency that phrase occurs in document d2
230 $fd2p = $document_phrases{$ID2}{$phrase};
231
232 #log base e($N/the number of documents that have phrase as a keyphrase)
233 $log_freq = log($N/$phrases_document{$phrase});
234 $sum = $fd2p * $log_freq;
235 $wq += ($sum * $sum); #sum squared
236 }
237
238 my $wdwq = sqrt($wq) * sqrt($wd);
239 my $cosine = $wqtwdt / $wdwq;
240
241 return $cosine;
242
243}
244
245#this function adds the relational data we have collected to the gml files
246sub addtogml {
247
248 print STDERR "\nadding relational data to $N gml documents...\n\n";
249 my @doclist;
250 my %filetable;
251 my $pattern = "kea="; #pattern to search for to find where to insert relation data
252
253 #open archive info for each collecticollection
254 foreach $collection (@directories){
255
256 open(INFO, "$gsdlhome/collect/$collection/archives/archives.inf")
257 or die "$gsdlhome/collect/$collection/archives/archives.inf could not be opened";
258
259 #read a list of ID's and pathnames into a file table
260 while(<INFO>){
261 chomp;
262 my %idtable;
263 my $ID;
264 if(/(\w+)(\s+)([\w\.\/]+)/){ #format of the line
265 $ID = $1;
266 $path = $3;
267 $idtable{"$path"} = "$collection";
268 }
269 $filetable{"$ID"} = ({%idtable});
270 }
271
272 close(INFO);
273 }
274
275
276
277 #for each id in the matrix calculate a list of related documents
278 foreach $id (keys %cosine_matrix){
279
280 @doclist = &calculate_list($id); #gets list of documents with most relevant scores
281
282 my @path = keys %{$filetable{$id}}; #get filepath for document
283 my $collection = $filetable{$id}{$path[0]};
284
285 #open gml file to amend for each collection
286 open(FILE, "$gsdlhome/collect/$collection/archives/$path[0]")
287 or die "$gsdlhome/collect/$collection/archives/$path[0] could not be opened";
288
289 #read the gml file into text
290 $text = "";
291 while(<FILE>){
292 $text .= $_;
293 }
294
295 close(FILE);
296
297 #delete previous relational data and urllinks to files storing rel doc links
298 $text =~ s/(\s)+relation=\"([^\"]*)\"//g;
299 $text =~ s/(\s)+urllink=\"([^\"]*)\"//g;
300 $text =~ s/(\s)+\/urllink=\"([^\"]*)\"//g;
301
302 #if we want to insert each relation item as 'relation1="id" relation2="id"'
303 #$count = 1; #insert each item in the list
304 #foreach $item (@doclist){ #insert list into text
305 # $text =~ s/(\s)($pattern)/ relation$count=\"$item\" $2/g;
306 # $count++;
307 #}
308
309 #or relation="id, id, id"
310 #my $relation = join(",", @doclist);
311
312 #if pdf flag is set then delete pdf url document
313 if($do_pdfs == 1){
314 my $dirpath = $path[0];
315 $dirpath =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
316 `rm $gsdlhome/collect/$collection/index/assoc/$dirpath/url.txt`;
317 }
318
319 my $title = "related document";
320 $title = $2 if($text =~ /(\s)+Title=\"([^\"]*)\"/g);
321
322 #relation="collection,id collection,id"
323 my @relationlist;
324 my $relation = "";
325 foreach $doc (@doclist){
326 my @p = keys %{$filetable{$doc}};
327 my $collect = $filetable{$doc}{$p[0]};
328 push(@relationlist, "$collect,$doc");
329
330 #write pdf docs to file
331 if($do_pdfs == 1){
332 my @rel_path = keys %{$filetable{$doc}};
333 &write_related_urls($title, $collection, $path[0], $collect, $rel_path[0]);
334 }
335 }
336
337 #modify the text to include relation data
338 $relation = join(" ", @relationlist);
339 $text =~ s/($pattern)/ relation=\"$relation\" $1/g;
340
341 #open gml file to write back new text
342 open(FILE, ">$gsdlhome/collect/$collection/archives/$path[0]") or print STDERR "NO!!!!\n";
343 print FILE "$text"; #write back text
344 close(FILE);
345
346 #amend the pdf file
347 if($do_pdfs == 1){
348 my $pdf_path = $path[0];
349 $pdf_path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
350 `perl amend_pdf.pl $gsdlhome/collect/$collection/index/assoc/$pdf_path/doc.pdf $gsdlhome/collect/$collection/index/assoc/$pdf_path/url.txt\n`;
351 }
352 }
353}
354
355#this function builds a list of cosine measures for each
356#document and sorts the list of ids each measure belongs to
357#in reverse order (ie docs with greatest cosine measure first)
358#returns the list calculated
359sub calculate_list {
360
361 my $document = shift(@_);
362 my %measures;
363 my @doclist;
364
365
366 #find the top $number_of_files for $document
367 foreach $id (keys %{$cosine_matrix{$document}}){
368 $measures{"$cosine_matrix{$document}{$id}"} = "$id";
369 }
370
371 my @list = reverse sort {$a<=>$b} keys %measures;
372
373 #list is as big as specified in command line (default is 1)
374 for($i = 0; $i<$number_of_files; $i++){
375 my $id = $measures{$list[$i]};
376 push(@doclist, $id) if($id ne "");
377 }
378
379 return @doclist;
380}
381
382
383sub write_related_urls {
384
385
386 #need server name eg nzdl2.cs.waikato.ac.nz
387 #localhost should work but doesn't on this computer?
388 #open up config file to read httpprefix
389 #get collection name of related document
390 #get hash directory of related document
391 #write http://nzdl2.cs.waikato.ac.nz/httpprefix/collectionname/index/assoc/directory/doc.pdf
392
393 my ($title, $collection, $path, $collect, $rel_path) = @_;
394
395 $rel_path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
396 $path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
397
398 print STDERR "writing related pdf urls to file ";
399 print STDERR "$gsdlhome/collect/$collection/index/assoc/$path/url.txt...\n";
400
401 open(URL, ">>$gsdlhome/collect/$collection/index/assoc/$path/url.txt")
402 or open(URL, ">$gsdlhome/collect/$collection/index/assoc/$path/url.txt")
403 or print STDERR "This file $title is not a pdf file\n";
404
405 print URL "$title\t";
406 print URL "http://$servername/$httpprefix/collect/$collect/index/assoc/$rel_path/doc.pdf\n";
407 close URL;
408
409}
410
411
412
413
414
415
416
Note: See TracBrowser for help on using the repository browser.