Changeset 15872 for gsdl/trunk/perllib/plugins/HTMLPlugin.pm
- Timestamp: 2008-06-05T09:29:32+12:00 (16 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/plugins/HTMLPlugin.pm
r15865 r15872 1 1 ########################################################################### 2 2 # 3 # HTMLPlug .pm -- basic html plugin3 # HTMLPlugin.pm -- basic html plugin 4 4 # 5 5 # A component of the Greenstone digital library software … … 34 34 # 35 35 36 package HTMLPlug; 37 38 use BasPlug; 36 package HTMLPlugin; 37 38 use ReadTextFile; 39 use HBPlugin; 39 40 use ghtml; 40 41 use unicode; … … 46 47 47 48 sub BEGIN { 48 @HTMLPlug ::ISA = ('BasPlug');49 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin'); 49 50 } 50 51 … … 54 55 my $arguments = 55 56 [ { 'name' => "process_exp", 56 'desc' => "{Bas Plug.process_exp}",57 'desc' => "{BasePlugin.process_exp}", 57 58 'type' => "regexp", 58 59 'deft' => &get_default_process_exp() }, 59 60 { 'name' => "block_exp", 60 'desc' => "{Bas Plug.block_exp}",61 'desc' => "{BasePlugin.block_exp}", 61 62 'type' => 'regexp', 62 63 'deft' => &get_default_block_exp() }, 63 64 { 'name' => "nolinks", 64 'desc' => "{HTMLPlug .nolinks}",65 'desc' => "{HTMLPlugin.nolinks}", 65 66 'type' => "flag" }, 66 67 { 'name' => "keep_head", 67 'desc' => "{HTMLPlug .keep_head}",68 'desc' => "{HTMLPlugin.keep_head}", 68 69 'type' => "flag" }, 69 70 { 'name' => "no_metadata", 70 'desc' => "{HTMLPlug .no_metadata}",71 'desc' => "{HTMLPlugin.no_metadata}", 71 72 'type' => "flag" }, 72 73 { 'name' => "metadata_fields", 73 'desc' => "{HTMLPlug .metadata_fields}",74 'desc' => "{HTMLPlugin.metadata_fields}", 74 75 'type' => "string", 75 76 'deft' => "Title" }, 76 77 { 'name' => "hunt_creator_metadata", 77 'desc' => "{HTMLPlug .hunt_creator_metadata}",78 'desc' => "{HTMLPlugin.hunt_creator_metadata}", 78 79 'type' => "flag" }, 79 80 { 'name' => "file_is_url", 80 'desc' => "{HTMLPlug .file_is_url}",81 'desc' => "{HTMLPlugin.file_is_url}", 81 82 'type' => "flag" }, 82 83 { 'name' => "assoc_files", 83 'desc' => "{HTMLPlug .assoc_files}",84 'desc' => "{HTMLPlugin.assoc_files}", 84 85 'type' => "regexp", 85 86 'deft' => &get_default_block_exp() }, 86 87 { 'name' 
=> "rename_assoc_files", 87 'desc' => "{HTMLPlug .rename_assoc_files}",88 'desc' => "{HTMLPlugin.rename_assoc_files}", 88 89 'type' => "flag" }, 89 90 { 'name' => "title_sub", 90 'desc' => "{HTMLPlug .title_sub}",91 'desc' => "{HTMLPlugin.title_sub}", 91 92 'type' => "string", 92 93 'deft' => "" }, 93 94 { 'name' => "description_tags", 94 'desc' => "{HTMLPlug .description_tags}",95 'desc' => "{HTMLPlugin.description_tags}", 95 96 'type' => "flag" }, 96 97 # retain this for backward compatibility (w3mir option was replaced by 97 98 # file_is_url) 98 99 { 'name' => "w3mir", 99 # 'desc' => "{HTMLPlug .w3mir}",100 # 'desc' => "{HTMLPlugin.w3mir}", 100 101 'type' => "flag", 101 102 'hiddengli' => "yes"}, 102 103 { 'name' => "no_strip_metadata_html", 103 'desc' => "{HTMLPlug .no_strip_metadata_html}",104 'desc' => "{HTMLPlugin.no_strip_metadata_html}", 104 105 'type' => "string", 105 106 'deft' => "", 106 107 'reqd' => "no"}, 107 108 { 'name' => "sectionalise_using_h_tags", 108 'desc' => "{HTMLPlug .sectionalise_using_h_tags}",109 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}", 109 110 'type' => "flag" }, 110 111 { 'name' => "use_realistic_book", 111 'desc' => "{HTMLPlug .tidy_html}",112 'desc' => "{HTMLPlugin.tidy_html}", 112 113 'type' => "flag"}, 113 { 'name' => "is_old_HDL_tags", 114 'desc' => "{HTMLPlug.old_style_HDL}", 115 'type' => "flag"}, 116 { 'name' => "no_image_links", # in future think about removing this option, 117 'desc' => "{HTMLPlug.no_image_links}", # since it has become the default behaviour 118 'type' => "flag"}, 114 { 'name' => "old_style_HDL", 115 'desc' => "{HTMLPlugin.old_style_HDL}", 116 'type' => "flag"} 119 117 ]; 120 118 121 my $options = { 'name' => "HTMLPlug ",122 'desc' => "{HTMLPlug .desc}",119 my $options = { 'name' => "HTMLPlugin", 120 'desc' => "{HTMLPlugin.desc}", 123 121 'abstract' => "no", 124 122 'inherits' => "yes", … … 506 504 if (($self->{'tidy_html'}) || ($self->{'old_style_HDL'})) 507 505 { 508 # because the document has 
to be sectionalized set the description tags 509 $self->{'description_tags'} = 1; 510 511 # set the file to be tidied 512 $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/; 513 514 # get the tidied file 515 #my $tidy_filename = $self->tmp_tidy_file($input_filename); 516 my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename); 517 518 # derive tmp filename from input filename 519 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$"); 506 # because the document has to be sectionalized set the description tags 507 $self->{'description_tags'} = 1; 520 508 521 # set the new input file and base_dir to be from the tidied file 522 $file = "$tailname$suffix"; 523 $base_dir = $dirname; 509 # set the file to be tidied 510 $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/; 511 512 # get the tidied file 513 #my $tidy_filename = $self->tmp_tidy_file($input_filename); 514 my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename); 515 516 # derive tmp filename from input filename 517 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$"); 518 519 # set the new input file and base_dir to be from the tidied file 520 $file = "$tailname$suffix"; 521 $base_dir = $dirname; 524 522 } 525 523 526 524 # call the parent read_into_doc_obj 527 my ($process_status,$doc_obj) = &BasPlug::read_into_doc_obj($self,$pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli);525 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli); 528 526 529 527 return ($process_status,$doc_obj); … … 535 533 push(@$pluginlist, $class); 536 534 537 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}538 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};535 
push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); 536 push(@{$hashArgOptLists->{"OptList"}},$options); 539 537 540 538 541 my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);539 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists); 542 540 543 541 if ($self->{'w3mir'}) { … … 618 616 my $outhandle = $self->{'outhandle'}; 619 617 620 print STDERR "<Processing n='$file' p='HTMLPlug '>\n" if ($gli);621 622 print $outhandle "HTMLPlug : processing $file\n"618 print STDERR "<Processing n='$file' p='HTMLPlugin'>\n" if ($gli); 619 620 print $outhandle "HTMLPlugin: processing $file\n" 623 621 if $self->{'verbosity'} > 1; 624 622 … … 669 667 # URL metadata (even invalid ones) are used to support internal 670 668 # links, so even if 'file_is_url' is off, still need to store info 671 672 $file = &BasPlug::filename_to_metadata($self, $file); # ensures filename is in UTF8 character encoding673 my $web_url = "http://$ file";674 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url); # will eventually ensure it is utf8 anyway669 670 my $utf8_file = $self->filename_to_utf8_metadata($file); 671 my $web_url = "http://$utf8_file"; 672 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url); 675 673 676 674 if ($self->{'file_is_url'}) { … … 752 750 } 753 751 if ($cursection ne "") { 754 print $outhandle "HTMLPlug : WARNING: $file contains unmatched <Section></Section> tags\n";752 print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n"; 755 753 } 756 754 … … 760 758 if (!$found_something) { 761 759 if ($self->{'verbosity'} > 2) { 762 print $outhandle "HTMLPlug : WARNING: $file appears to contain no Section tags so\n";760 print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n"; 763 761 print $outhandle " will be processed as a single section document\n"; 764 762 } … … 775 773 776 774 } else { 777 print $outhandle "HTMLPlug 
: WARNING: $file contains the following text outside\n";775 print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n"; 778 776 print $outhandle " of the final closing </Section> tag. This text will\n"; 779 777 print $outhandle " be ignored."; … … 795 793 # been processed already but we should print the warning 796 794 # as above and extract metadata 797 print $outhandle "HTMLPlug : WARNING: $file appears to contain no Section tags and\n";795 print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n"; 798 796 print $outhandle " is blank or empty. Metadata will be assigned if present.\n"; 799 797 } … … 892 890 # trap images 893 891 894 # Previously, by default, HTMLPlug would embed <img> tags inside anchor tags892 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags 895 893 # i.e. <a href="image><img src="image"></a> in order to overcome a problem that 896 894 # turned regular text succeeding images into links. That is, by embedding <imgs> … … 907 905 908 906 # If at any time, there is a need for having images embedded in <a> anchor tags, 909 # then it might be better to turn that into an HTMLPlug option rather than make907 # then it might be better to turn that into an HTMLPlugin option rather than make 910 908 # it the default behaviour. Also, eventually, no_image_links needs to become 911 # a deprecated option for HTMLPlug as it has now become the default behaviour.909 # a deprecated option for HTMLPlugin as it has now become the default behaviour. 
912 910 913 911 #if(!$self->{'no_image_links'}){ 914 912 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)([\"][^\"]+[\"]|[\'][^\']+[\']|[^\s\/>]+)([^>]*>)/ 915 913 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge; 916 914 #} 917 915 … … 936 934 $back="\"$back"; 937 935 } 936 938 937 $link =~ s/\n/ /g; 939 938 … … 1074 1073 1075 1074 my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/; 1076 1075 1077 1076 $hash_part = "" if !defined $hash_part; 1078 1077 if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) { 1079 1078 my $outhandle = $self->{'outhandle'}; 1080 print $outhandle "HTMLPlug : ERROR - badly formatted tag ignored ($link)\n"1079 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n" 1081 1080 if $self->{'verbosity'}; 1082 1081 return ($link, "", 0); … … 1257 1256 1258 1257 if (!defined $tag) { 1259 print $outhandle "HTMLPlug : can't find NAME in \"$metatag\"\n";1258 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n"; 1260 1259 next; 1261 1260 } … … 1274 1273 } 1275 1274 if (!defined $value) { 1276 print $outhandle "HTMLPlug : can't find VALUE in \"$metatag\"\n";1275 print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n"; 1277 1276 next; 1278 1277 } … … 1425 1424 1426 1425 1427 # Extend the BasPlugread_file so that strings like é are1426 # Extend read_file so that strings like é are 1428 1427 # converted to UTF8 internally. 1429 1428 # … … 1432 1431 1433 1432 sub read_file { 1434 my ($self, $filename, $encoding, $language, $textref) = @_; 1435 1436 &BasPlug::read_file($self, $filename, $encoding, $language, $textref); 1433 my $self = shift(@_); 1434 my ($filename, $encoding, $language, $textref) = @_; 1435 1436 $self->SUPER::read_file($filename, $encoding, $language, $textref); 1437 1437 1438 1438 # Convert entities to their UTF8 equivalents
Note: See TracChangeset for help on using the changeset viewer.