1 | #!/usr/bin/perl
|
---|
2 |
|
---|
3 | #data structures used:
|
---|
4 | #-document_keyphrase
|
---|
5 | #%document_phrases = ("hashID" => {"phrase" => "number of occurrences"})
|
---|
6 | #-keyphrase_document
|
---|
7 | #%phrases_document = ("phrase" => "no. of docs phrase occurs in")
|
---|
8 |
|
---|
9 | #after calculation relational data, it will amend gml files
|
---|
10 | #to include this data while writing to a file ID_relatedlinks.txt
|
---|
11 | #the html links of the related files
|
---|
12 | #if option p is set then the user wishes to add the related documents
|
---|
13 | #as bookmarks to the pdf file
|
---|
14 |
|
---|
15 | require "getopts.pl";
|
---|
16 | &Getopts('N:P'); #process option arguments
|
---|
17 |
|
---|
18 |
|
---|
19 | $gsdlhome = $ENV{'GSDLHOME'};
|
---|
20 | @directories; #list of directories for the collections
|
---|
21 | %phrases_document; #hash storing phrase index => no. of documents with phrase in it
|
---|
22 | %document_phrases; #hash of hashes ID => {phrase number => no. of phrases in ID}
|
---|
23 | $N = 0; #number of documents in the index (presumably in the collection)
|
---|
24 | %cosine_matrix; #2-dimensional matrix, for each pair of documents stores a cosine measure
|
---|
25 | $number_of_files = 1; #by default the top related file is returned
|
---|
26 | $do_pdfs = 0;
|
---|
27 |
|
---|
28 | if($opt_N =~ /(\d)+/){ #if flag is set then return top N docs
|
---|
29 | $number_of_files = $opt_N;
|
---|
30 | }
|
---|
31 |
|
---|
32 | #get server name and httpprefix
|
---|
33 | $httpprefix;
|
---|
34 | $servername = "nzdl2.cs.waikato.ac.nz"; #should be localhost
|
---|
35 |
|
---|
36 | if($opt_P == 1){ #if flag is set
|
---|
37 | $do_pdfs = 1;
|
---|
38 |
|
---|
39 | #get the httpprefix from gsdlsite.cfg
|
---|
40 | my $cgibin = "cgi-bin/$ENV{'GSDLOS'}";
|
---|
41 | $cgibin = $cgibin.$ENV{'GSDLARCH'} if defined $ENV{'GSDLARCH';}
|
---|
42 | open(CFG, "$gsdlhome/$cgibin/gsdlsite.cfg")
|
---|
43 | or die "$gsdlhome/$cgibin/gsdlsite.cfg could not be opened";
|
---|
44 |
|
---|
45 | while(<CFG>){
|
---|
46 | chomp;
|
---|
47 | if(/httpprefix \/(.+)/){
|
---|
48 | $httpprefix = $1;
|
---|
49 | }
|
---|
50 | }
|
---|
51 | close CFG;
|
---|
52 | }
|
---|
53 |
|
---|
54 | if(@ARGV){
|
---|
55 |
|
---|
56 | @directories = @ARGV;
|
---|
57 |
|
---|
58 | } else { #open collect directory and get a list of all collections
|
---|
59 | opendir(DIR, "$gsdlhome/collect");
|
---|
60 | @directories = grep(!/(^\.|(CVS)|(modelcol))/, readdir(DIR));
|
---|
61 | closedir(DIR);
|
---|
62 | }
|
---|
63 |
|
---|
64 |
|
---|
65 |
|
---|
66 | #get the name of the directory the index will be stored in
|
---|
67 | $dirname = "";
|
---|
68 | foreach $collection (sort @directories){
|
---|
69 | $dirname .= $collection."_";
|
---|
70 | }
|
---|
71 | $dirname .= "indexes";
|
---|
72 | print STDERR "directory name: $dirname\n";
|
---|
73 |
|
---|
74 | #open keyphrases_document.txt
|
---|
75 | open(KEY_FILE, "$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt")
|
---|
76 | or die "$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt could not be opened";
|
---|
77 |
|
---|
78 | print STDERR "\nreading keyphrase_document.txt...\n";
|
---|
79 |
|
---|
80 | #read a list of phrase indexes and number of documents that phrase appears in
|
---|
81 | while(<KEY_FILE>){
|
---|
82 | chomp;
|
---|
83 | if(/(\d+)\:(\d+)(.*)/){ #all lines should be of this form
|
---|
84 | my $phrase_index = $1;
|
---|
85 | my $doc_num = $2;
|
---|
86 | $phrases_document{"$phrase_index"} = "$doc_num"; #build keyphrase hash
|
---|
87 | }
|
---|
88 | }
|
---|
89 |
|
---|
90 | close(KEY_FILE);
|
---|
91 |
|
---|
92 | print STDERR "\nreading document_keyphrase.txt...\n";
|
---|
93 |
|
---|
94 | #open document_keyphrases.txt
|
---|
95 | open(DOC_FILE, "$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt")
|
---|
96 | or die "$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt could not be opened";
|
---|
97 |
|
---|
98 | #read a list of documents and the phrases that appear in them (& how many times they appear)
|
---|
99 | while(<DOC_FILE>){
|
---|
100 | chomp;
|
---|
101 | my %document;
|
---|
102 | if(/(\w+)\:(\d+)((\|\d+,\d+)+)/){ #every line in the index of this form
|
---|
103 | my $ID = $1;
|
---|
104 | my $pairs = $3;
|
---|
105 | while ($pairs =~ s/\|(\d+),(\d+)//) { #this is the list of keyphrases and numbers
|
---|
106 | my $phrase_index = $1;
|
---|
107 | my $num_phrase = $2;
|
---|
108 | $document{"$phrase_index"} = "$num_phrase"; #table to be stored in another table
|
---|
109 | }
|
---|
110 | $document_phrases{"$ID"} = ({%document});
|
---|
111 | $N++; #number of documents in the collection (which have kea metadata)
|
---|
112 | }
|
---|
113 | }
|
---|
114 |
|
---|
115 | close(DOC_FILE);
|
---|
116 |
|
---|
117 | #we do not want to recalculate measures ie if we have already calculated
|
---|
118 | #doc1 & doc2 we do not then want to calculate doc2 & doc1
|
---|
119 | #to achieve this we must first sort the list of ids so that both lists
|
---|
120 | #of documents will be in the same order (otherwise hash tables return values in
|
---|
121 | #no particular order). We then compare each document in the left
|
---|
122 | #column below against the documents in the right column:
|
---|
123 |
|
---|
124 | #doc0 ->
|
---|
125 | #doc1 -> doc0
|
---|
126 | #doc2 -> doc0, doc 1
|
---|
127 | #doc3 -> doc0, doc 1, doc 2
|
---|
128 | #doc4 -> doc0, doc 1, doc 2, doc 3
|
---|
129 | #doc5 -> doc0, doc 1, doc 2, doc 3, doc 4
|
---|
130 | #etc
|
---|
131 |
|
---|
132 | #this way we are not recalculating the same value and also not
|
---|
133 | #comparing each document to itself (which would result in the
|
---|
134 | #value 1) thus more than halving the processing time.
|
---|
135 |
|
---|
136 | print STDERR "\ncalculating cosine measures for $N documents...\n";
|
---|
137 |
|
---|
138 | $count = 0;
|
---|
139 | @ids = sort keys(%document_phrases);
|
---|
140 |
|
---|
141 | #repeat for all pairs of documents
|
---|
142 | foreach $id1 (sort keys(%document_phrases)) {
|
---|
143 | my %table;
|
---|
144 |
|
---|
145 | for($i=0; $i<$count; $i++){ #calculate the cosine measure and store
|
---|
146 | my $cosine_measure = &compare_docs($id1, $ids[$i]);
|
---|
147 | $table{"$ids[$i]"} = "$cosine_measure";
|
---|
148 | }
|
---|
149 | $cosine_matrix{"$id1"} = ({%table});
|
---|
150 | $count++; #each iteration the number of documents to compare against expands by 1
|
---|
151 | }
|
---|
152 |
|
---|
153 | &addtogml(); #add the relational data we have just gathered to the documents gml files
|
---|
154 |
|
---|
155 |
|
---|
156 | #this function takes as an argument two document IDs and uses these to extract the
|
---|
157 | #keyphrases from the document_keyphrase data structure. It then calculates a
|
---|
158 | #'measure of relativity' [0-1] with which we can establish how similar the two
|
---|
159 | #documents are to each other. (0-not related, 1-same document). The equation
|
---|
160 | #used to establish this similarity:
|
---|
161 | #
|
---|
162 | # for all phrases
|
---|
163 | # in both d1, d2 (fd1p * loge(N/fp) (fd2p * loge(N/fp)
|
---|
164 | # cosine(d1, d2) = --------------------------------------------------------------
|
---|
165 | # sqrt(sum of phrases in d1(fd1p * loge(N/fp))) *
|
---|
166 | # sqrt(sum of phrases in d2(fd2p * loge(N/fp)))
|
---|
167 | #
|
---|
168 | #where d1 and d2 are lists of keyphrase and represent documents
|
---|
169 | #fd1p is the frequency that phrase p occurs in document d1
|
---|
170 | #fd2p is the frequency that phrase p occurs in document d2
|
---|
171 | #fp is the number of documents that have p as a keyphrase
|
---|
172 | #N is the number of documents in the collection
|
---|
173 | sub compare_docs {
|
---|
174 |
|
---|
175 |
|
---|
176 | my $ID1 = shift(@_); #ID for document one
|
---|
177 | my $ID2 = shift(@_); #ID for document two
|
---|
178 |
|
---|
179 | my @phrases; #a list of phrases in document 1 and document 2
|
---|
180 | my @phrases1; #a list of phrases in document 1
|
---|
181 | my @phrases2; #a list of phrases in document 2
|
---|
182 |
|
---|
183 |
|
---|
184 | foreach $phrase (keys %{ $document_phrases{$ID1}}) { #list of phrases in doc1
|
---|
185 | push(@phrases1, $phrase);
|
---|
186 | }
|
---|
187 | foreach $phrase (keys %{ $document_phrases{$ID2}}) { #list of phrases in doc2
|
---|
188 | push(@phrases2, $phrase);
|
---|
189 | }
|
---|
190 | foreach $phrase1 (@phrases1) { #list holds intersection of doc1 and doc2
|
---|
191 | foreach $phrase2 (@phrases2) {
|
---|
192 | push(@phrases, $phrase1) if ($phrase1 == $phrase2);
|
---|
193 | }
|
---|
194 | }
|
---|
195 |
|
---|
196 | #COSINE MEASURE
|
---|
197 | my $wqtwdt= 0;
|
---|
198 |
|
---|
199 | foreach $phrase (@phrases){ #for all phrases ocurring in d1 and d2
|
---|
200 | #the frequency that phrase occurs in document d1
|
---|
201 | $fd1p = $document_phrases{$ID1}{$phrase};
|
---|
202 |
|
---|
203 | #log base e(N/the number of documents that have phrase as a keyphrase)
|
---|
204 | $log_freq = log($N/$phrases_document{$phrase});
|
---|
205 |
|
---|
206 | #the frequency that phrase occurs in document d2
|
---|
207 | $fd2p = $document_phrases{$ID2}{$phrase};
|
---|
208 | $sum = ($fd1p * $log_freq) * ($fd2p * $log_freq);
|
---|
209 | $wqtwdt += $sum;
|
---|
210 | }
|
---|
211 |
|
---|
212 |
|
---|
213 | my $wd = 0; #stores the calculation for wd
|
---|
214 |
|
---|
215 | foreach $phrase (@phrases1){ #for all phrases ocurring in d1 and d2
|
---|
216 | #the frequency that phrase occurs in document d1
|
---|
217 | $fd1p = $document_phrases{$ID1}{$phrase};
|
---|
218 |
|
---|
219 | #log base e($N/the number of documents that have phrase as a keyphrase)
|
---|
220 | $log_freq = log($N/$phrases_document{$phrase});
|
---|
221 | $sum = $fd1p * $log_freq;
|
---|
222 | $wd += ($sum * $sum); #sum squared
|
---|
223 | }
|
---|
224 |
|
---|
225 |
|
---|
226 | my $wq = 0; #stores the calculation for wq
|
---|
227 |
|
---|
228 | foreach $phrase (@phrases2){ #for all phrases ocurring in d2
|
---|
229 | #the frequency that phrase occurs in document d2
|
---|
230 | $fd2p = $document_phrases{$ID2}{$phrase};
|
---|
231 |
|
---|
232 | #log base e($N/the number of documents that have phrase as a keyphrase)
|
---|
233 | $log_freq = log($N/$phrases_document{$phrase});
|
---|
234 | $sum = $fd2p * $log_freq;
|
---|
235 | $wq += ($sum * $sum); #sum squared
|
---|
236 | }
|
---|
237 |
|
---|
238 | my $wdwq = sqrt($wq) * sqrt($wd);
|
---|
239 | my $cosine = $wqtwdt / $wdwq;
|
---|
240 |
|
---|
241 | return $cosine;
|
---|
242 |
|
---|
243 | }
|
---|
244 |
|
---|
245 | #this function adds the relational data we have collected to the gml files
|
---|
246 | sub addtogml {
|
---|
247 |
|
---|
248 | print STDERR "\nadding relational data to $N gml documents...\n\n";
|
---|
249 | my @doclist;
|
---|
250 | my %filetable;
|
---|
251 | my $pattern = "kea="; #pattern to search for to find where to insert relation data
|
---|
252 |
|
---|
253 | #open archive info for each collecticollection
|
---|
254 | foreach $collection (@directories){
|
---|
255 |
|
---|
256 | open(INFO, "$gsdlhome/collect/$collection/archives/archives.inf")
|
---|
257 | or die "$gsdlhome/collect/$collection/archives/archives.inf could not be opened";
|
---|
258 |
|
---|
259 | #read a list of ID's and pathnames into a file table
|
---|
260 | while(<INFO>){
|
---|
261 | chomp;
|
---|
262 | my %idtable;
|
---|
263 | my $ID;
|
---|
264 | if(/(\w+)(\s+)([\w\.\/]+)/){ #format of the line
|
---|
265 | $ID = $1;
|
---|
266 | $path = $3;
|
---|
267 | $idtable{"$path"} = "$collection";
|
---|
268 | }
|
---|
269 | $filetable{"$ID"} = ({%idtable});
|
---|
270 | }
|
---|
271 |
|
---|
272 | close(INFO);
|
---|
273 | }
|
---|
274 |
|
---|
275 |
|
---|
276 |
|
---|
277 | #for each id in the matrix calculate a list of related documents
|
---|
278 | foreach $id (keys %cosine_matrix){
|
---|
279 |
|
---|
280 | @doclist = &calculate_list($id); #gets list of documents with most relevant scores
|
---|
281 |
|
---|
282 | my @path = keys %{$filetable{$id}}; #get filepath for document
|
---|
283 | my $collection = $filetable{$id}{$path[0]};
|
---|
284 |
|
---|
285 | #open gml file to amend for each collection
|
---|
286 | open(FILE, "$gsdlhome/collect/$collection/archives/$path[0]")
|
---|
287 | or die "$gsdlhome/collect/$collection/archives/$path[0] could not be opened";
|
---|
288 |
|
---|
289 | #read the gml file into text
|
---|
290 | $text = "";
|
---|
291 | while(<FILE>){
|
---|
292 | $text .= $_;
|
---|
293 | }
|
---|
294 |
|
---|
295 | close(FILE);
|
---|
296 |
|
---|
297 | #delete previous relational data and urllinks to files storing rel doc links
|
---|
298 | $text =~ s/(\s)+relation=\"([^\"]*)\"//g;
|
---|
299 | $text =~ s/(\s)+urllink=\"([^\"]*)\"//g;
|
---|
300 | $text =~ s/(\s)+\/urllink=\"([^\"]*)\"//g;
|
---|
301 |
|
---|
302 | #if we want to insert each relation item as 'relation1="id" relation2="id"'
|
---|
303 | #$count = 1; #insert each item in the list
|
---|
304 | #foreach $item (@doclist){ #insert list into text
|
---|
305 | # $text =~ s/(\s)($pattern)/ relation$count=\"$item\" $2/g;
|
---|
306 | # $count++;
|
---|
307 | #}
|
---|
308 |
|
---|
309 | #or relation="id, id, id"
|
---|
310 | #my $relation = join(",", @doclist);
|
---|
311 |
|
---|
312 | #if pdf flag is set then delete pdf url document
|
---|
313 | if($do_pdfs == 1){
|
---|
314 | my $dirpath = $path[0];
|
---|
315 | $dirpath =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
|
---|
316 | `rm $gsdlhome/collect/$collection/index/assoc/$dirpath/url.txt`;
|
---|
317 | }
|
---|
318 |
|
---|
319 | my $title = "related document";
|
---|
320 | $title = $2 if($text =~ /(\s)+Title=\"([^\"]*)\"/g);
|
---|
321 |
|
---|
322 | #relation="collection,id collection,id"
|
---|
323 | my @relationlist;
|
---|
324 | my $relation = "";
|
---|
325 | foreach $doc (@doclist){
|
---|
326 | my @p = keys %{$filetable{$doc}};
|
---|
327 | my $collect = $filetable{$doc}{$p[0]};
|
---|
328 | push(@relationlist, "$collect,$doc");
|
---|
329 |
|
---|
330 | #write pdf docs to file
|
---|
331 | if($do_pdfs == 1){
|
---|
332 | my @rel_path = keys %{$filetable{$doc}};
|
---|
333 | &write_related_urls($title, $collection, $path[0], $collect, $rel_path[0]);
|
---|
334 | }
|
---|
335 | }
|
---|
336 |
|
---|
337 | #modify the text to include relation data
|
---|
338 | $relation = join(" ", @relationlist);
|
---|
339 | $text =~ s/($pattern)/ relation=\"$relation\" $1/g;
|
---|
340 |
|
---|
341 | #open gml file to write back new text
|
---|
342 | open(FILE, ">$gsdlhome/collect/$collection/archives/$path[0]") or print STDERR "NO!!!!\n";
|
---|
343 | print FILE "$text"; #write back text
|
---|
344 | close(FILE);
|
---|
345 |
|
---|
346 | #amend the pdf file
|
---|
347 | if($do_pdfs == 1){
|
---|
348 | my $pdf_path = $path[0];
|
---|
349 | $pdf_path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
|
---|
350 | `perl amend_pdf.pl $gsdlhome/collect/$collection/index/assoc/$pdf_path/doc.pdf $gsdlhome/collect/$collection/index/assoc/$pdf_path/url.txt\n`;
|
---|
351 | }
|
---|
352 | }
|
---|
353 | }
|
---|
354 |
|
---|
355 | #this function builds a list of cosine measures for each
|
---|
356 | #document and sorts the list of ids each measure belongs to
|
---|
357 | #in reverse order (ie docs with greatest cosine measure first)
|
---|
358 | #returns the list calculated
|
---|
359 | sub calculate_list {
|
---|
360 |
|
---|
361 | my $document = shift(@_);
|
---|
362 | my %measures;
|
---|
363 | my @doclist;
|
---|
364 |
|
---|
365 |
|
---|
366 | #find the top $number_of_files for $document
|
---|
367 | foreach $id (keys %{$cosine_matrix{$document}}){
|
---|
368 | $measures{"$cosine_matrix{$document}{$id}"} = "$id";
|
---|
369 | }
|
---|
370 |
|
---|
371 | my @list = reverse sort {$a<=>$b} keys %measures;
|
---|
372 |
|
---|
373 | #list is as big as specified in command line (default is 1)
|
---|
374 | for($i = 0; $i<$number_of_files; $i++){
|
---|
375 | my $id = $measures{$list[$i]};
|
---|
376 | push(@doclist, $id) if($id ne "");
|
---|
377 | }
|
---|
378 |
|
---|
379 | return @doclist;
|
---|
380 | }
|
---|
381 |
|
---|
382 |
|
---|
383 | sub write_related_urls {
|
---|
384 |
|
---|
385 |
|
---|
386 | #need server name eg nzdl2.cs.waikato.ac.nz
|
---|
387 | #localhost should work but doesn't on this computer?
|
---|
388 | #open up config file to read httpprefix
|
---|
389 | #get collection name of related document
|
---|
390 | #get hash directory of related document
|
---|
391 | #write http://nzdl2.cs.waikato.ac.nz/httpprefix/collectionname/index/assoc/directory/doc.pdf
|
---|
392 |
|
---|
393 | my ($title, $collection, $path, $collect, $rel_path) = @_;
|
---|
394 |
|
---|
395 | $rel_path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
|
---|
396 | $path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/;
|
---|
397 |
|
---|
398 | print STDERR "writing related pdf urls to file ";
|
---|
399 | print STDERR "$gsdlhome/collect/$collection/index/assoc/$path/url.txt...\n";
|
---|
400 |
|
---|
401 | open(URL, ">>$gsdlhome/collect/$collection/index/assoc/$path/url.txt")
|
---|
402 | or open(URL, ">$gsdlhome/collect/$collection/index/assoc/$path/url.txt")
|
---|
403 | or print STDERR "This file $title is not a pdf file\n";
|
---|
404 |
|
---|
405 | print URL "$title\t";
|
---|
406 | print URL "http://$servername/$httpprefix/collect/$collect/index/assoc/$rel_path/doc.pdf\n";
|
---|
407 | close URL;
|
---|
408 |
|
---|
409 | }
|
---|
410 |
|
---|
411 |
|
---|
412 |
|
---|
413 |
|
---|
414 |
|
---|
415 |
|
---|
416 |
|
---|