source: gsdl/trunk/perllib/plugins/GISExtractor.pm@ 15918

Last change on this file since 15918 was 15881, checked in by kjdon, 16 years ago

auxiliary plugins now pass an extra argument to the PrintInfo constructor so that argument parsing is not done - parsing needs to be done by the main plugin which has all the arguments

  • Property svn:keywords set to Author Date Id Revision
File size: 11.3 KB
Line 
1###########################################################################
2#
3# GISExtractor.pm -- extension base class to enhance plugins with GIS capabilities
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package GISExtractor;
27
28use PrintInfo;
29
30use util;
31
32use gsprintf 'gsprintf';
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35no strict 'subs';
36
37#field categories in DataBase files
38#$LAT = 3;
39#$LONG = 4;
40my $FC = 9;
41my $DSG = 10;
42#$CC1 = 12;
43my $FULL_NAME = 22;
44
45BEGIN {
46 @GISExtractor::ISA = ('PrintInfo');
47}
48
49
50my $arguments =
51 [ { 'name' => "extract_placenames",
52 'desc' => "{GISExtractor.extract_placenames}",
53 'type' => "flag",
54 'reqd' => "no" },
55 { 'name' => "gazetteer",
56 'desc' => "{GISExtractor.gazetteer}",
57 'type' => "string",
58 'reqd' => "no" },
59 { 'name' => "place_list",
60 'desc' => "{GISExtractor.place_list}",
61 'type' => "flag",
62 'reqd' => "no" } ];
63
64
65my $options = { 'name' => "GISExtractor",
66 'desc' => "{GISExtractor.desc}",
67 'abstract' => "yes",
68 'inherits' => "yes",
69 'args' => $arguments };
70
71
72sub new {
73 my ($class) = shift (@_);
74 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
75 push(@$pluginlist, $class);
76
77 # can we indicate that these are not available if the map data is not there??
78 #if (has_mapdata()) {
79 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
80 push(@{$hashArgOptLists->{"OptList"}},$options);
81 #}
82 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists, 1);
83
84 if ($self->{'extract_placenames'}) {
85
86 my $outhandle = $self->{'outhandle'};
87
88 my $places_ref
89 = $self->loadGISDatabase($outhandle,$self->{'gazetteer'});
90
91 if (!defined $places_ref) {
92 print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
93 print $outhandle " No placename extraction will take place.\n";
94 $self->{'extract_placenames'} = undef;
95 }
96 else {
97 $self->{'places'} = $places_ref;
98 }
99 }
100
101 return bless $self, $class;
102
103}
104
105
106sub extract_gis_metadata
107{
108 my $self = shift (@_);
109 my ($doc_obj) = @_;
110
111 if ($self->{'extract_placenames'}) {
112 my $thissection = $doc_obj->get_top_section();
113 while (defined $thissection) {
114 my $text = $doc_obj->get_text($thissection);
115 $self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
116 $thissection = $doc_obj->get_next_section ($thissection);
117 }
118 }
119
120}
121
122sub has_mapdata
123{
124 my $db_dir = &util::filename_cat($ENV{'GSDLHOME'}, "lamp", "data");
125 return ( -d $db_dir );
126}
127
128
129#returns a hash table of names from database files (specified in collect.cfg).
130sub loadGISDatabase {
131 my ($outhandle,$datasets) = @_;
132 my @dbase = map{$_ = $_ . ".txt";} split(/,/, $datasets);
133 if(scalar(@dbase)==0) { #default is to include all databases
134 @dbase=("UK.txt", "NZ.txt", "NF.txt", "CA.txt", "AS.txt", "GM.txt", "BE.txt", "IN.txt", "JA.txt", "USA.txt");
135 }
136 my $counter=0;
137 my %places = ();
138 my @cats = ();
139 while($counter <= $#dbase){ #loop through all the databases
140 my $folder = $dbase[$counter];
141 $folder =~ s/(.*?)\.txt/$1/;
142#### my $dbName = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "mapdata", "data", $folder, $dbase[$counter]);
143 my $dbName = &util::filename_cat($ENV{'GSDLHOME'}, "lamp", "data", $folder, $dbase[$counter]);
144 if (!open(FILEIN, "<$dbName")) {
145 print $outhandle "Unable to open database $dbName: $!\n";
146 return undef;
147 }
148
149 my $line = <FILEIN>; #database category details.
150 my @catdetails = split("\t", $line);
151
152 while ( defined($line = <FILEIN>)){ #stores all place names in file as keys in hash array %places
153 @cats = split("\t", $line);
154 if( #eliminating "bad" place names without missing real ones
155 ($cats[$FC] eq "A" && !($cats[$DSG] =~ /ADMD|ADM2|PRSH/))
156 ||($cats[$FC] eq "P" && ($cats[$DSG] =~ /PPLA|PPLC|PPLS/))
157 ||($cats[$FC] eq "H" && ($cats[$DSG] =~ /BAY|LK/))
158 ||($cats[$FC] eq "L" && !($cats[$DSG] =~ /LCTY|RGN/))
159 ||($cats[$FC] eq "T" && ($cats[$DSG] =~ /CAPE|ISL|GRGE/))
160 ||($dbase[$counter] eq "USA.txt" && ($cats[$DSG] =~ /ppl|island/))
161 ){$places{$cats[$FULL_NAME]} = [@cats];}
162 @cats = ();
163 }
164 close(FILEIN);
165 $counter++;
166 }
167 return \%places;
168}
169
170#returns a unique hash array of all the places found in a document, along with coordinates, description and country data
171sub getPlaces {
172 my $self = shift @_;
173
174 my ($textref, $places) = @_;
175 my %tempPlaces = ();
176
177 foreach my $plc (%$places){ #search for an occurrence of each place in the text
178 if($$textref =~ m/(\W)$plc(\W)/){
179 $tempPlaces{$plc} = $places->{$plc};
180 }
181 }
182 #make sure each place is only there once
183 my %uniquePlaces = ();
184 foreach my $p (keys %tempPlaces) {
185 if(!defined($uniquePlaces{$p})){
186 $uniquePlaces{$p} = $tempPlaces{$p};
187 }
188 }
189 return \%uniquePlaces;
190}
191
192
193
194
195#returns a lowercase version of the place, with no spaces
196sub placename_to_anchorname {
197 my ($placename) = @_;
198 my $p_tag = lc($placename);
199 $p_tag =~ s/\s+//g;
200 return $p_tag;
201}
202
203#takes a place from the text and wraps an anchor tag, a hyperlink tag and an image tag around it
204sub anchor_wrapper {
205 my ($p, $p_counter_ref, $path) = @_;
206 my $image = "/gsdlgis/gisimages/nextplace.gif";
207 my $endTag = "</a>";
208 my $hrefTag = "<a href=";
209 $$p_counter_ref++;
210 my $next = $$p_counter_ref + 1;
211 my $p_tag = placename_to_anchorname($p);
212 my $place_anchor = "<a name=\"" . $p_tag . $$p_counter_ref . "\">" . $endTag;
213 my $popup_anchor = $hrefTag . "'" . $path . "'>" . $p . $endTag;
214 my $image_anchor = $hrefTag . "#" . $p_tag . $next . "><img src=\"" . $image . "\" name=\"img" . $p_tag . $$p_counter_ref . "\" border=\"0\">" . $endTag;
215 return $place_anchor . $popup_anchor . $image_anchor;
216}
217
218#takes dangerous place names and checks if they are part of another placename or not.
219sub place_name_check {
220 my ($pre, $preSpace, $p, $postSpace, $post, $p_counter_ref, $path, $y) = @_;
221 if($pre =~ /$y/ || $post =~ /$y/) {return $pre . $preSpace . $p . $postSpace . $post;}
222 $pre = $pre . $preSpace;
223 $post = $postSpace . $post;
224 return $pre . &anchor_wrapper("", $p, "", $p_counter_ref, $path) . $post;
225}
226
227sub extract_placenames {
228 my $self = shift (@_);
229 my ($textref, $doc_obj, $thissection) = @_;
230 my $outhandle = $self->{'outhandle'};
231
232 my $GSDLHOME = $ENV{'GSDLHOME'};
233 #field categories in DataBase file for extract_placenames.
234 my $LAT = 3;
235 my $LONG = 4;
236 my $CC1 = 12;
237
238
239 &gsprintf($outhandle, " {BasPlug.extracting_placenames}...\n")
240 if ($self->{'verbosity'} > 2);
241
242 #get all the places found in the document
243 my $uniquePlaces = $self->getPlaces($textref, $self->{'places'});
244
245 #finds 'dangerous' placenames (eg York and New York). Dangerous because program will find "York" within "New York"
246 my %danger = ();
247 foreach my $x (keys %$uniquePlaces){
248 foreach my $y (keys %$uniquePlaces){
249 if(($y =~ m/ /) && ($y =~ m/$x/) && ($y ne $x)){
250 $y =~ s/($x\s)|(\s$x)//;
251 $danger{$x} = $y;
252 }
253 }
254 }
255
256 #creates a list of clickable placenames at top of page, linked to first occurrence of name, and reads them into a file
257 my $tempfname = $doc_obj;
258 $tempfname =~ s/.*\(0x(.*)\)/$1/;
259 my $names = "";
260 my $filename = "tmpfile" . $tempfname;
261 my $tempfile = &util::filename_cat($GSDLHOME, "tmp", $filename);
262 open(FOUT, ">$tempfile") || die "Unable to create a temp file: $!";
263 foreach my $name (sort (keys %$uniquePlaces)){
264 if(!defined($danger{$name})){
265 my $name_tag = placename_to_anchorname($name);
266 print FOUT "$name\t" . $uniquePlaces->{$name}->[$LONG] . "\t" . $uniquePlaces->{$name}->[$LAT] . "\n";
267 if($self->{'place_list'}) {$names = $names . "<a href=\"#" . $name_tag . "1\">" . $name . "</a>" . "\n";}
268 }
269 }
270 close(FOUT);
271 $doc_obj->associate_file($tempfile, "places.txt", "text/plain");
272 $self->{'places_filename'} = $tempfile;
273
274 my %countries = ();
275
276 foreach my $p (keys %$uniquePlaces){
277 my $place = $p;
278 $place =~ s/\s+|\n+|\r+|\t+/(\\s+)/g;
279 my $cap_place = uc($place);
280 my $long = $uniquePlaces->{$p}->[$LONG];
281 my $lat = $uniquePlaces->{$p}->[$LAT];
282 my $country = $uniquePlaces->{$p}->[$CC1];
283 my $path = "javascript:popUp(\"$long\",\"$lat\",\"$p\",\"$country\")";
284 my $p_counter = 0;
285
286 if(!defined($danger{$p})){
287 #adds html tags to each place name
288 $$textref =~ s/\b($place|$cap_place)\b/&anchor_wrapper($1,\$p_counter,$path)/sge;
289 }
290 #else {
291 #$y = $danger{$p};
292 #$$textref =~ s/(\w+)(\s+?)($place|$cap_place)(\s+?)(\w+)/&place_name_check($1,$2,$3,$4,$5,\$p_counter,$path, $y)/sge;
293 #}
294
295 #edits the last place's image, and removes image if place only occurres once.
296 my $p_tag = placename_to_anchorname($p);
297 $p_counter++;
298 $$textref =~ s/#$p_tag$p_counter(><img src="\/gsdl\/images\/)nextplace.gif/#${p_tag}1$1firstplace.gif/;
299 $$textref =~ s/<img src="\/gsdl\/images\/firstplace.gif" name="img$p_tag(1)" border="0">//;
300
301 #this line removes apostrophes from placenames (they break the javascript function)
302 $$textref =~ s/(javascript:popUp.*?)(\w)'(\w)/$1$2$3/g; #' (to get emacs colours back)
303
304 #for displaying map of document, count num of places from each country
305 if(defined($countries{$country})){$countries{$country}++;}
306 else{$countries{$country} = 1;}
307
308 #adds placename to metadata
309 $doc_obj->add_utf8_metadata ($thissection, "Placename", $p);
310 &gsprintf($outhandle, " {BasPlug.extracting} $p\n")
311 if ($self->{'verbosity'} > 3);
312 }
313 #finding the country that most places are from, in order to display map of the document
314 my $max = 0;
315 my $CNTRY = "";
316 foreach my $c_key (keys %countries){
317 if($countries{$c_key} > $max){
318 $max = $countries{$c_key};
319 $CNTRY = $c_key;
320 }
321 }
322 #allows user to view map with all places from the document on it
323#### my $places_filename = &util::filename_cat($GSDLHOME, "collect", "_cgiargc_", "index", "assoc", "_thisOID_", "places.txt");
324 my $places_filename = &util::filename_cat("collect", "_cgiargc_", "index", "assoc", "_thisOID_", "places.txt");
325 my $docmap = "<a href='javascript:popUp(\"$CNTRY\",\"$places_filename\")'>View map for this document<\/a><br><br>\n";
326 $$textref = $docmap . $names . "<br>" . $$textref;
327
328 $doc_obj->delete_text($thissection);
329 $doc_obj->add_utf8_text($thissection, $$textref);
330 &gsprintf($outhandle, " {BasPlug.done_places_extract}\n")
331 if ($self->{'verbosity'} > 2);
332}
333
334sub clean_up_temp_files {
335 my $self = shift(@_);
336
337 if(defined($self->{'places_filename'}) && -e $self->{'places_filename'}){
338 &util::rm($self->{'places_filename'});
339 }
340 $self->{'places_filename'} = undef;
341
342}
Note: See TracBrowser for help on using the repository browser.