Changeset 9398 for trunk/gsdl/perllib/plugins/BasPlug.pm
- Timestamp: 2005-03-14T09:44:10+13:00 (19 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BasPlug.pm
r9351 r9398 26 26 package BasPlug; 27 27 28 BEGIN { 29 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'}; 30 } 31 28 32 eval {require bytes}; 29 33 … … 48 52 use printusage; 49 53 54 use GISBasPlug; 55 56 @ISA = ( GISBasPlug ); 50 57 51 58 my $unicode_list = … … 106 113 'desc' => "{BasPlug.markup_acronyms}", 107 114 'type' => "flag", 108 'reqd' => "no" }, 115 'reqd' => "no" }, 109 116 { 'name' => "extract_keyphrases", 110 117 'desc' => "{BasPlug.extract_keyphrases}", … … 147 154 'reqd' => "no" } ]; 148 155 156 my $gis_arguments = 157 [ { 'name' => "extract_placenames", 158 'desc' => "{GISBasPlug.extract_placenames}", 159 'type' => "flag", 160 'reqd' => "no" }, 161 { 'name' => "gazetteer", 162 'desc' => "{GISBasPlug.gazetteer}", 163 'type' => "string", 164 'reqd' => "no" }, 165 { 'name' => "place_list", 166 'desc' => "{GISBasPlug.place_list}", 167 'type' => "flag", 168 'reqd' => "no" } ]; 169 170 149 171 my $options = { 'name' => "BasPlug", 150 172 'desc' => "{BasPlug.desc}", … … 301 323 my $self = {}; 302 324 $self->{'plugin_type'} = "BasPlug"; 325 326 if (GISBasPlug::has_mapdata()) { 327 push(@$arguments,@$gis_arguments); 328 } 329 303 330 my $enc = "^("; 304 331 map {$enc .= "$_|";} keys %$encodings::encodings; … … 328 355 q^extract_language^, \$self->{'extract_language'}, 329 356 q^extract_acronyms^, \$self->{'extract_acronyms'}, 330 q^extract_keyphrases^, \$self->{'kea'}, 331 q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, 357 q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED) 358 q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED) 332 359 qq^input_encoding/$enc/auto^, \$self->{'input_encoding'}, 333 360 qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'}, 334 361 q^extract_email^, \$self->{'extract_email'}, 362 q^extract_placenames^, \$self->{'extract_placenames'}, 363 q^gazetteer/.*/^, \$self->{'gazetteer'}, 364 q^place_list^, \$self->{'place_list'}, 335 365 
q^markup_acronyms^, \$self->{'markup_acronyms'}, 336 366 q^default_language/.{2}/en^, \$self->{'default_language'}, … … 367 397 368 398 $self->{'cover_image'} = 0 if ($no_cover_image); 369 399 400 if ($self->{'extract_placenames'}) { 401 402 my $outhandle = $self->{'outhandle'}; 403 404 my $places_ref 405 = GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'}); 406 407 if (!defined $places_ref) { 408 print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n"; 409 print $outhandle " No placename extraction will take place.\n"; 410 $self->{'extract_placenames'} = undef; 411 } 412 else { 413 $self->{'places'} = $places_ref; 414 } 415 } 370 416 return bless $self, $class; 371 417 } … … 626 672 my $smart_block = $self->{'smart_block'}; 627 673 my $smart_block_BN = $self->{'smart_block_BN'}; 628 674 629 675 my $filename = $file; 630 676 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; … … 663 709 664 710 my ($filemeta) = $file =~ /([^\\\/]+)$/; 665 666 711 # how do we know what encoding the filename is in? 
667 712 $doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta)); … … 690 735 691 736 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); 692 737 693 738 # do plugin specific processing of doc_obj 694 739 unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) { … … 699 744 $text=''; 700 745 undef $text; 701 746 702 747 # do any automatic metadata extraction 703 748 $self->auto_extract_metadata ($doc_obj); … … 716 761 $processor->process($doc_obj); 717 762 763 if(defined($self->{'places_filename'})){ 764 &util::rm($self->{'places_filename'}); 765 $self->{'places_filename'} = undef; 766 } 767 718 768 $self->{'num_processed'} ++; 719 769 undef $doc_obj; … … 784 834 return $file_derived_title; 785 835 } 836 786 837 787 838 sub title_fallback … … 952 1003 foreach my $field (keys(%$metadata)) { 953 1004 # $metadata->{$field} may be an array reference 954 955 1005 if ($field eq "gsdlassocfile_tobe") { 956 1006 # 'gsdlassocfile_tobe' is artificially introduced metadata … … 1063 1113 } 1064 1114 } 1065 1115 if ($self->{'extract_placenames'}) { 1116 my $thissection = $doc_obj->get_top_section(); 1117 while (defined $thissection) { 1118 my $text = $doc_obj->get_text($thissection); 1119 $self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./; 1120 $thissection = $doc_obj->get_next_section ($thissection); 1121 } 1122 } 1066 1123 1067 1124 # adding kea keyphrases 1068 1069 if ($self->{'kea'}) { 1125 if ($self->{'kea'}) { 1070 1126 1071 1127 my $thissection = $doc_obj->get_top_section(); … … 1079 1135 $thissection = $doc_obj->get_next_section ($thissection); 1080 1136 } 1081 1082 1083 if($self->{'kea_options'}) { 1137 1138 if ($self->{'kea_options'}) { 1084 1139 #if kea options flag is set, call Kea with specified options 1085 1140 $list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'}); … … 1088 1143 $list = &Kea::extract_KeyPhrases ($text); 1089 1144 } 
1090 if ($list){1145 if ($list){ 1091 1146 # if a list of kea keyphrases was returned (ie not empty) 1092 1147 &gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n"); … … 1104 1159 } 1105 1160 } 1106 } 1107 1108 #end of kea 1161 } #end of kea 1109 1162 1110 1163 if ($self->{'first'}) {
Note: See TracChangeset for help on using the changeset viewer.