#!/usr/bin/perl #data structures used: #-document_keyphrase #%document_phrases = ("hashID" => {"phrase" => "number of occurrences"}) #-keyphrase_document #%phrases_document = ("phrase" => "no. of docs phrase occurs in") #after calculation relational data, it will amend gml files #to include this data while writing to a file ID_relatedlinks.txt #the html links of the related files #if option p is set then the user wishes to add the related documents #as bookmarks to the pdf file require "getopts.pl"; &Getopts('N:P'); #process option arguments $gsdlhome = $ENV{'GSDLHOME'}; @directories; #list of directories for the collections %phrases_document; #hash storing phrase index => no. of documents with phrase in it %document_phrases; #hash of hashes ID => {phrase number => no. of phrases in ID} $N = 0; #number of documents in the index (presumably in the collection) %cosine_matrix; #2-dimensional matrix, for each pair of documents stores a cosine measure $number_of_files = 1; #by default the top related file is returned $do_pdfs = 0; if($opt_N =~ /(\d)+/){ #if flag is set then return top N docs $number_of_files = $opt_N; } #get server name and httpprefix $httpprefix; $servername = "nzdl2.cs.waikato.ac.nz"; #should be localhost if($opt_P == 1){ #if flag is set $do_pdfs = 1; #get the httpprefix from gsdlsite.cfg open(CFG, "$gsdlhome/cgi-bin/gsdlsite.cfg") or die "$gsdlhome/cgi-bin/gsdlsite.cfg could not be opened"; while(){ chomp; if(/httpprefix \/(.+)/){ $httpprefix = $1; } } close CFG; } if(@ARGV){ @directories = @ARGV; } else { #open collect directory and get a list of all collections opendir(DIR, "$gsdlhome/collect"); @directories = grep(!/(^\.|(CVS)|(modelcol))/, readdir(DIR)); closedir(DIR); } #get the name of the directory the index will be stored in $dirname = ""; foreach $collection (sort @directories){ $dirname .= $collection."_"; } $dirname .= "indexes"; print STDERR "directory name: $dirname\n"; #open keyphrases_document.txt open(KEY_FILE, "$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt") or die "$gsdlhome/bin/script/indexes/$dirname/keyphrase_document.txt could not be opened"; print STDERR "\nreading keyphrase_document.txt...\n"; #read a list of phrase indexes and number of documents that phrase appears in while(){ chomp; if(/(\d+)\:(\d+)(.*)/){ #all lines should be of this form my $phrase_index = $1; my $doc_num = $2; $phrases_document{"$phrase_index"} = "$doc_num"; #build keyphrase hash } } close(KEY_FILE); print STDERR "\nreading document_keyphrase.txt...\n"; #open document_keyphrases.txt open(DOC_FILE, "$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt") or die "$gsdlhome/bin/script/indexes/$dirname/document_keyphrase.txt could not be opened"; #read a list of documents and the phrases that appear in them (& how many times they appear) while(){ chomp; my %document; if(/(\w+)\:(\d+)((\|\d+,\d+)+)/){ #every line in the index of this form my $ID = $1; my $pairs = $3; while ($pairs =~ s/\|(\d+),(\d+)//) { #this is the list of keyphrases and numbers my $phrase_index = $1; my $num_phrase = $2; $document{"$phrase_index"} = "$num_phrase"; #table to be stored in another table } $document_phrases{"$ID"} = ({%document}); $N++; #number of documents in the collection (which have kea metadata) } } close(DOC_FILE); #we do not want to recalculate measures ie if we have already calculated #doc1 & doc2 we do not then want to calculate doc2 & doc1 #to achieve this we must first sort the list of ids so that both lists #of documents will be in the same order (otherwise hash tables return values in #no particular order). We then compare each document in the left #column below against the documents in the right column: #doc0 -> #doc1 -> doc0 #doc2 -> doc0, doc 1 #doc3 -> doc0, doc 1, doc 2 #doc4 -> doc0, doc 1, doc 2, doc 3 #doc5 -> doc0, doc 1, doc 2, doc 3, doc 4 #etc #this way we are not recalculating the same value and also not #comparing each document to itself (which would result in the #value 1) thus more than halving the processing time. print STDERR "\ncalculating cosine measures for $N documents...\n"; $count = 0; @ids = sort keys(%document_phrases); #repeat for all pairs of documents foreach $id1 (sort keys(%document_phrases)) { my %table; for($i=0; $i<$count; $i++){ #calculate the cosine measure and store my $cosine_measure = &compare_docs($id1, $ids[$i]); $table{"$ids[$i]"} = "$cosine_measure"; } $cosine_matrix{"$id1"} = ({%table}); $count++; #each iteration the number of documents to compare against expands by 1 } &addtogml(); #add the relational data we have just gathered to the documents gml files #this function takes as an argument two document IDs and uses these to extract the #keyphrases from the document_keyphrase data structure. It then calculates a #'measure of relativity' [0-1] with which we can establish how similar the two #documents are to each other. (0-not related, 1-same document). The equation #used to establish this similarity: # # for all phrases # in both d1, d2 (fd1p * loge(N/fp) (fd2p * loge(N/fp) # cosine(d1, d2) = -------------------------------------------------------------- # sqrt(sum of phrases in d1(fd1p * loge(N/fp))) * # sqrt(sum of phrases in d2(fd2p * loge(N/fp))) # #where d1 and d2 are lists of keyphrase and represent documents #fd1p is the frequency that phrase p occurs in document d1 #fd2p is the frequency that phrase p occurs in document d2 #fp is the number of documents that have p as a keyphrase #N is the number of documents in the collection sub compare_docs { my $ID1 = shift(@_); #ID for document one my $ID2 = shift(@_); #ID for document two my @phrases; #a list of phrases in document 1 and document 2 my @phrases1; #a list of phrases in document 1 my @phrases2; #a list of phrases in document 2 foreach $phrase (keys %{ $document_phrases{$ID1}}) { #list of phrases in doc1 push(@phrases1, $phrase); } foreach $phrase (keys %{ $document_phrases{$ID2}}) { #list of phrases in doc2 push(@phrases2, $phrase); } foreach $phrase1 (@phrases1) { #list holds intersection of doc1 and doc2 foreach $phrase2 (@phrases2) { push(@phrases, $phrase1) if ($phrase1 == $phrase2); } } #COSINE MEASURE my $wqtwdt= 0; foreach $phrase (@phrases){ #for all phrases ocurring in d1 and d2 #the frequency that phrase occurs in document d1 $fd1p = $document_phrases{$ID1}{$phrase}; #log base e(N/the number of documents that have phrase as a keyphrase) $log_freq = log($N/$phrases_document{$phrase}); #the frequency that phrase occurs in document d2 $fd2p = $document_phrases{$ID2}{$phrase}; $sum = ($fd1p * $log_freq) * ($fd2p * $log_freq); $wqtwdt += $sum; } my $wd = 0; #stores the calculation for wd foreach $phrase (@phrases1){ #for all phrases ocurring in d1 and d2 #the frequency that phrase occurs in document d1 $fd1p = $document_phrases{$ID1}{$phrase}; #log base e($N/the number of documents that have phrase as a keyphrase) $log_freq = log($N/$phrases_document{$phrase}); $sum = $fd1p * $log_freq; $wd += ($sum * $sum); #sum squared } my $wq = 0; #stores the calculation for wq foreach $phrase (@phrases2){ #for all phrases ocurring in d2 #the frequency that phrase occurs in document d2 $fd2p = $document_phrases{$ID2}{$phrase}; #log base e($N/the number of documents that have phrase as a keyphrase) $log_freq = log($N/$phrases_document{$phrase}); $sum = $fd2p * $log_freq; $wq += ($sum * $sum); #sum squared } my $wdwq = sqrt($wq) * sqrt($wd); my $cosine = $wqtwdt / $wdwq; return $cosine; } #this function adds the relational data we have collected to the gml files sub addtogml { print STDERR "\nadding relational data to $N gml documents...\n\n"; my @doclist; my %filetable; my $pattern = "kea="; #pattern to search for to find where to insert relation data #open archive info for each collecticollection foreach $collection (@directories){ open(INFO, "$gsdlhome/collect/$collection/archives/archives.inf") or die "$gsdlhome/collect/$collection/archives/archives.inf could not be opened"; #read a list of ID's and pathnames into a file table while(){ chomp; my %idtable; my $ID; if(/(\w+)(\s+)([\w\.\/]+)/){ #format of the line $ID = $1; $path = $3; $idtable{"$path"} = "$collection"; } $filetable{"$ID"} = ({%idtable}); } close(INFO); } #for each id in the matrix calculate a list of related documents foreach $id (keys %cosine_matrix){ @doclist = &calculate_list($id); #gets list of documents with most relevant scores my @path = keys %{$filetable{$id}}; #get filepath for document my $collection = $filetable{$id}{$path[0]}; #open gml file to amend for each collection open(FILE, "$gsdlhome/collect/$collection/archives/$path[0]") or die "$gsdlhome/collect/$collection/archives/$path[0] could not be opened"; #read the gml file into text $text = ""; while(){ $text .= $_; } close(FILE); #delete previous relational data and urllinks to files storing rel doc links $text =~ s/(\s)+relation=\"([^\"]*)\"//g; $text =~ s/(\s)+urllink=\"([^\"]*)\"//g; $text =~ s/(\s)+\/urllink=\"([^\"]*)\"//g; #if we want to insert each relation item as 'relation1="id" relation2="id"' #$count = 1; #insert each item in the list #foreach $item (@doclist){ #insert list into text # $text =~ s/(\s)($pattern)/ relation$count=\"$item\" $2/g; # $count++; #} #or relation="id, id, id" #my $relation = join(",", @doclist); #if pdf flag is set then delete pdf url document if($do_pdfs == 1){ my $dirpath = $path[0]; $dirpath =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/; `rm $gsdlhome/collect/$collection/index/assoc/$dirpath/url.txt`; } my $title = "related document"; $title = $2 if($text =~ /(\s)+Title=\"([^\"]*)\"/g); #relation="collection,id collection,id" my @relationlist; my $relation = ""; foreach $doc (@doclist){ my @p = keys %{$filetable{$doc}}; my $collect = $filetable{$doc}{$p[0]}; push(@relationlist, "$collect,$doc"); #write pdf docs to file if($do_pdfs == 1){ my @rel_path = keys %{$filetable{$doc}}; &write_related_urls($title, $collection, $path[0], $collect, $rel_path[0]); } } #modify the text to include relation data $relation = join(" ", @relationlist); $text =~ s/($pattern)/ relation=\"$relation\" $1/g; #open gml file to write back new text open(FILE, ">$gsdlhome/collect/$collection/archives/$path[0]") or print STDERR "NO!!!!\n"; print FILE "$text"; #write back text close(FILE); #amend the pdf file if($do_pdfs == 1){ my $pdf_path = $path[0]; $pdf_path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/; `perl amend_pdf.pl $gsdlhome/collect/$collection/index/assoc/$pdf_path/doc.pdf $gsdlhome/collect/$collection/index/assoc/$pdf_path/url.txt\n`; } } } #this function builds a list of cosine measures for each #document and sorts the list of ids each measure belongs to #in reverse order (ie docs with greatest cosine measure first) #returns the list calculated sub calculate_list { my $document = shift(@_); my %measures; my @doclist; #find the top $number_of_files for $document foreach $id (keys %{$cosine_matrix{$document}}){ $measures{"$cosine_matrix{$document}{$id}"} = "$id"; } my @list = reverse sort {$a<=>$b} keys %measures; #list is as big as specified in command line (default is 1) for($i = 0; $i<$number_of_files; $i++){ my $id = $measures{$list[$i]}; push(@doclist, $id) if($id ne ""); } return @doclist; } sub write_related_urls { #need server name eg nzdl2.cs.waikato.ac.nz #localhost should work but doesn't on this computer? #open up config file to read httpprefix #get collection name of related document #get hash directory of related document #write http://nzdl2.cs.waikato.ac.nz/httpprefix/collectionname/index/assoc/directory/doc.pdf my ($title, $collection, $path, $collect, $rel_path) = @_; $rel_path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/; $path =~ s/(^(\/doc\.gml))*\/doc\.(gml)/$1/; print STDERR "writing related pdf urls to file "; print STDERR "$gsdlhome/collect/$collection/index/assoc/$path/url.txt...\n"; open(URL, ">>$gsdlhome/collect/$collection/index/assoc/$path/url.txt") or open(URL, ">$gsdlhome/collect/$collection/index/assoc/$path/url.txt") or print STDERR "This file $title is not a pdf file\n"; print URL "$title\t"; print URL "http://$servername/$httpprefix/collect/$collect/index/assoc/$rel_path/doc.pdf\n"; close URL; }