Changeset 9413 for trunk/gsdl/perllib/plugins
- Timestamp:
- 2005-03-14T14:05:28+13:00 (19 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BasPlug.pm
r9403 r9413 6 6 # University of Waikato, New Zealand. 7 7 # 8 # Copyright (C) 1999 New Zealand Digital Library Project8 # Copyright (C) 1999-2005 New Zealand Digital Library Project 9 9 # 10 10 # This program is free software; you can redistribute it and/or modify … … 36 36 $SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)}; 37 37 38 use strict; no strict 'subs'; 39 38 40 use File::Basename; 39 41 … … 49 51 use DateExtract; 50 52 use ghtml; 51 use gsprintf ;53 use gsprintf 'gsprintf'; 52 54 use printusage; 53 55 54 56 use GISBasPlug; 55 57 56 @ ISA = ( GISBasPlug );58 @BasPlug::ISA = ( GISBasPlug ); 57 59 58 60 my $unicode_list = … … 176 178 177 179 178 sub gsprintf179 {180 return &gsprintf::gsprintf(@_);181 }182 180 183 181 … … 198 196 199 197 # XML output is always in UTF-8 200 &gsprintf::output_strings_in_UTF8;201 202 &PrintUsage::print_xml_header();198 gsprintf::output_strings_in_UTF8; 199 200 PrintUsage::print_xml_header(); 203 201 $self->print_xml(); 204 202 } … … 214 212 return if (!defined($pluginoptions)); 215 213 216 &gsprintf(STDERR, "<PlugInfo>\n");217 &gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");218 my $desc = &gsprintf::lookup_string($pluginoptions->{'desc'});214 gsprintf(STDERR, "<PlugInfo>\n"); 215 gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n"); 216 my $desc = gsprintf::lookup_string($pluginoptions->{'desc'}); 219 217 $desc =~ s/</&lt;/g; # doubly escaped 220 218 $desc =~ s/>/&gt;/g; 221 219 222 &gsprintf(STDERR, " <Desc>$desc</Desc>\n");223 &gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");224 &gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");225 &gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} || "no") . "</Explodes>\n");226 &gsprintf(STDERR, " <Arguments>\n");220 gsprintf(STDERR, " <Desc>$desc</Desc>\n"); 221 gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n"); 222 gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n"); 223 gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} || "no") . "</Explodes>\n"); 224 gsprintf(STDERR, " <Arguments>\n"); 227 225 if (defined($pluginoptions->{'args'})) { 228 226 &PrintUsage::print_options_xml($pluginoptions->{'args'}); … … 232 230 $self->print_xml(); 233 231 234 &gsprintf(STDERR, " </Arguments>\n");235 &gsprintf(STDERR, "</PlugInfo>\n");232 gsprintf(STDERR, " </Arguments>\n"); 233 gsprintf(STDERR, "</PlugInfo>\n"); 236 234 } 237 235 … … 291 289 if ($isleafclass) { 292 290 if (defined($plugindesc)) { 293 &gsprintf(STDERR, "$plugindesc\n\n");294 } 295 &gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");291 gsprintf(STDERR, "$plugindesc\n\n"); 292 } 293 gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n"); 296 294 } 297 295 … … 302 300 303 301 if ($isleafclass) { 304 &gsprintf(STDERR, " {common.specific_options}:\n");302 gsprintf(STDERR, " {common.specific_options}:\n"); 305 303 } 306 304 else { 307 &gsprintf(STDERR, " {common.general_options}:\n", $pluginname);305 gsprintf(STDERR, " {common.general_options}:\n", $pluginname); 308 306 } 309 307 … … 376 374 "allow_extra_options")) { 377 375 378 &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);376 gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name); 379 377 bless $self, $class; 380 378 $self->print_txt_usage(""); # Use default resource bundle … … 667 665 668 666 if ($self->is_recursive()) { 669 &gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";667 gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n"; 670 668 } 671 669 … … 722 720 if (!length ($text)) { 723 721 my $plugin_name = ref ($self); 724 &gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};722 gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'}; 725 723 726 724 my $failhandle = $self->{'failhandle'}; 727 &gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");725 gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n"); 728 726 # print $failhandle "$file: " . ref($self) . ": file contains no text\n"; 729 727 $self->{'num_not_processed'} ++; … … 777 775 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_; 778 776 779 &gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";777 gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n"; 780 778 # die "Basplug::process function must be implemented in sub-class\n"; 781 779 … … 794 792 { 795 793 my $outhandle = $self->{'outhandle'}; 796 &gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};794 gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'}; 797 795 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'}; 798 796 return; … … 801 799 $$textref = ""; 802 800 803 open (FILE, $filename) || (&gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); 804 # open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n"; 801 if (!open (FILE, $filename)) { 802 gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename); 803 die "\n"; 804 } 805 805 806 806 if ($encoding eq "ascii") { … … 868 868 my $plugin_name = ref ($self); 869 869 my $outhandle = $self->{'outhandle'}; 870 &gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);870 gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding); 871 871 # print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but "; 872 872 # print $outhandle "appears to be encoded as $extracted_encoding.\n"; … … 886 886 my ($filename) = @_; 887 887 my $outhandle = $self->{'outhandle'}; 888 888 my $unicode_format = ""; 889 889 # read in file 890 open (FILE, $filename) || ( &gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";890 open (FILE, $filename) || (gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n"; 891 891 undef $/; 892 892 my $text = <FILE>; … … 894 894 close FILE; 895 895 896 # check if first few bytes have a Byte Order Marker 897 my $bom=substr($text,0,2); # check 16bit unicode 898 if ($bom eq "\xff\xfe") { # little endian 16bit unicode 899 $unicode_format="unicode"; 900 } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode 901 $unicode_format="unicode"; 902 } else { 903 $bom=substr($text,0,3); # check utf-8 904 if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom 905 $unicode_format="utf8"; 906 # } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!? 907 # $unicode_format="utf8"; 908 } 909 } 910 911 896 912 # remove <title>stuff</title> -- as titles tend often to be in English 897 913 # for foreign language documents … … 899 915 900 916 # remove all HTML tags 901 $text =~ s/<[^>]*>//sg; 917 # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo) 918 if (ref($self) eq 'HTMLPlug' || 919 (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){ 920 $text =~ s/<[^>]*>//sg; 921 } 902 922 903 923 # get the language/encoding … … 907 927 # first one in the list - otherwise use the defaults 908 928 if (scalar @$results > 3) { 909 # changed 12 Feb 2003 by jrm21 910 # use the most popular encoding at least... otherwise we might 911 # generate invalid archive files! 912 my %guessed_encodings = (); 913 foreach my $result (@$results) { 914 $result =~ /([^\-]+)$/; 915 my $enc=$1; 916 if (!defined($guessed_encodings{$enc})) { 917 $guessed_encodings{$enc}=0; 929 my $best_encoding=""; 930 if ($unicode_format) { # in case the first had a BOM 931 $best_encoding=$unicode_format; 932 } else { 933 my %guessed_encodings = (); 934 foreach my $result (@$results) { 935 $result =~ /([^\-]+)$/; 936 my $enc=$1; 937 if (!defined($guessed_encodings{$enc})) { 938 $guessed_encodings{$enc}=0; 939 } 940 $guessed_encodings{$enc}++; 918 941 } 919 $guessed_encodings{$enc}++; 920 }921 my $best_encoding="";922 $guessed_encodings{""}=-1;923 foreach my $enc (keys %guessed_encodings){924 if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}){925 $best_encoding=$enc;942 943 $guessed_encodings{""}=-1; # for default best_encoding of "" 944 foreach my $enc (keys %guessed_encodings) { 945 if ($guessed_encodings{$enc} > 946 $guessed_encodings{$best_encoding}){ 947 $best_encoding=$enc; 948 } 926 949 } 927 950 } … … 929 952 if ($self->{'input_encoding'} ne 'auto') { 930 953 if ($self->{'extract_language'} && $self->{'verbosity'}) { 931 &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});932 # print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";933 # print $outhandle "defaulting to $self->{'default_language'}\n";954 gsprintf($outhandle, 955 "BasPlug: {BasPlug.could_not_extract_language}\n", 956 $filename, $self->{'default_language'}); 934 957 } 935 958 return ($self->{'default_language'}, $self->{'input_encoding'}); … … 937 960 } else { 938 961 if ($self->{'verbosity'}) { 939 &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});940 # print $outhandle "BASPlug: WARNING: language could not be extracted from $filename - ";941 # print $outhandle "defaulting to $self->{'default_language'}.\n";962 gsprintf($outhandle, 963 "BasPlug: {BasPlug.could_not_extract_language}\n", 964 $filename, $self->{'default_language'}); 942 965 } 943 966 return ($self->{'default_language'}, $best_encoding); … … 949 972 if (!defined $language) { 950 973 if ($self->{'verbosity'}) { 951 &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});952 # print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";953 # print $outhandle "defaulting to $self->{'default_language'}\n";974 gsprintf($outhandle, 975 "BasPlug: {BasPlug.could_not_extract_language}\n", 976 $filename, $self->{'default_language'}); 954 977 } 955 978 $language = $self->{'default_language'}; … … 957 980 if (!defined $encoding) { 958 981 if ($self->{'verbosity'}) { 959 &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_encoding}\n", $filename, $self->{'default_encoding'});960 # print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";961 # print $outhandle "defaulting to $self->{'default_encoding'}\n";982 gsprintf($outhandle, 983 "BasPlug: {BasPlug.could_not_extract_encoding}\n", 984 $filename, $self->{'default_encoding'}); 962 985 } 963 986 $encoding = $self->{'default_encoding'}; … … 1078 1101 my $outhandle = $self->{'outhandle'}; 1079 1102 1080 # print $outhandle " extracting email addresses ...\n" 1081 &gsprintf($outhandle, " {BasPlug.extracting_emails}...\n") 1103 gsprintf($outhandle, " {BasPlug.extracting_emails}...\n") 1082 1104 if ($self->{'verbosity'} > 2); 1083 1105 … … 1090 1112 push @email2, $address; 1091 1113 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address); 1092 # print $outhandle " extracting $address\n" 1093 &gsprintf($outhandle, " {BasPlug.extracting} $address\n") 1114 gsprintf($outhandle, " {BasPlug.extracting} $address\n") 1094 1115 if ($self->{'verbosity'} > 3); 1095 1116 } 1096 1117 } 1097 # print $outhandle " done extracting email addresses.\n" 1098 &gsprintf($outhandle, " {BasPlug.done_email_extract}\n") 1118 gsprintf($outhandle, " {BasPlug.done_email_extract}\n") 1099 1119 if ($self->{'verbosity'} > 2); 1100 1120 } … … 1128 1148 my $thissection = $doc_obj->get_top_section(); 1129 1149 my $text = ""; 1130 my @list;1150 my $list; 1131 1151 1132 1152 #loop through sections to gather whole doc … … 1146 1166 if ($list){ 1147 1167 # if a list of kea keyphrases was returned (ie not empty) 1148 &gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n"); 1168 if ($self->{'verbosity'}) { 1169 gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n"); 1170 } 1149 1171 1150 1172 #add metadata to top section … … 1216 1238 1217 1239 # print $outhandle " extracting acronyms ...\n" 1218 &gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")1240 gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n") 1219 1241 if ($self->{'verbosity'} > 2); 1220 1242 … … 1229 1251 if ($thisAcro eq $acro->to_string()) { 1230 1252 $seen_before = "true"; 1231 # print $outhandle " already seen ". $acro->to_string() . "\n" 1232 &gsprintf($outhandle, " {BasPlug.already_seen} " . $acro->to_string() . "\n") 1233 if ($self->{'verbosity'} >= 4); 1253 if ($self->{'verbosity'} >= 4) { 1254 gsprintf($outhandle, " {BasPlug.already_seen} " . 1255 $acro->to_string() . "\n"); 1256 } 1234 1257 } 1235 1258 } … … 1241 1264 #do the normal acronym 1242 1265 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string()); 1243 # print $outhandle " adding ". $acro->to_string() . "\n" 1244 &gsprintf($outhandle, " {BasPlug.adding} " . $acro->to_string() . "\n") 1266 gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n") 1245 1267 if ($self->{'verbosity'} > 3); 1246 1268 } 1247 1269 } 1248 1270 1249 # print $outhandle " done extracting acronyms. \n" 1250 &gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n") 1271 gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n") 1251 1272 if ($self->{'verbosity'} > 2); 1252 1273 } … … 1257 1278 my $outhandle = $self->{'outhandle'}; 1258 1279 1259 # print $outhandle " marking up acronyms ...\n" 1260 &gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n") 1280 gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n") 1261 1281 if ($self->{'verbosity'} > 2); 1262 1282 … … 1264 1284 $text = &acronym::markup_acronyms($text, $self); 1265 1285 1266 # print $outhandle " done marking up acronyms. \n" 1267 &gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n") 1286 gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n") 1268 1287 if ($self->{'verbosity'} > 2); 1269 1288 … … 1285 1304 my ($doc_obj, $filename) = @_; 1286 1305 1306 my $top_section=$doc_obj->get_top_section(); 1307 1287 1308 $filename =~ s/\.[^\\\/\.]+$/\.jpg/; 1288 1309 if (-e $filename) { 1289 1310 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg"); 1290 $doc_obj->add_utf8_metadata($t hissection, "hascover", 1);1311 $doc_obj->add_utf8_metadata($top_section, "hascover", 1); 1291 1312 } else { 1292 1313 $filename =~ s/jpg$/JPG/; 1293 1314 if (-e $filename) { 1294 1315 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg"); 1295 $doc_obj->add_utf8_metadata($t hissection, "hascover", 1);1316 $doc_obj->add_utf8_metadata($top_section, "hascover", 1); 1296 1317 } 1297 1318 }
Note:
See TracChangeset
for help on using the changeset viewer.