Ignore:
Timestamp:
2008-06-05T09:29:32+12:00 (16 years ago)
Author:
kjdon
Message:

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Also, I haven't really tested anything yet...

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/HTMLPlugin.pm

    r15865 r15872  
    11###########################################################################
    22#
    3 # HTMLPlug.pm -- basic html plugin
     3# HTMLPlugin.pm -- basic html plugin
    44#
    55# A component of the Greenstone digital library software
     
    3434#
    3535
    36 package HTMLPlug;
    37 
    38 use BasPlug;
     36package HTMLPlugin;
     37
     38use ReadTextFile;
     39use HBPlugin;
    3940use ghtml;
    4041use unicode;
     
    4647
    4748sub BEGIN {
    48     @HTMLPlug::ISA = ('BasPlug');
     49    @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
    4950}
    5051
     
    5455my $arguments =
    5556    [ { 'name' => "process_exp",
    56     'desc' => "{BasPlug.process_exp}",
     57    'desc' => "{BasePlugin.process_exp}",
    5758    'type' => "regexp",
    5859    'deft' =>  &get_default_process_exp() },
    5960      { 'name' => "block_exp",
    60     'desc' => "{BasPlug.block_exp}",
     61    'desc' => "{BasePlugin.block_exp}",
    6162    'type' => 'regexp',
    6263    'deft' =>  &get_default_block_exp() },
    6364      { 'name' => "nolinks",
    64     'desc' => "{HTMLPlug.nolinks}",
     65    'desc' => "{HTMLPlugin.nolinks}",
    6566    'type' => "flag" },
    6667      { 'name' => "keep_head",
    67     'desc' => "{HTMLPlug.keep_head}",
     68    'desc' => "{HTMLPlugin.keep_head}",
    6869    'type' => "flag" },
    6970      { 'name' => "no_metadata",
    70     'desc' => "{HTMLPlug.no_metadata}",
     71    'desc' => "{HTMLPlugin.no_metadata}",
    7172    'type' => "flag" },
    7273      { 'name' => "metadata_fields",
    73     'desc' => "{HTMLPlug.metadata_fields}",
     74    'desc' => "{HTMLPlugin.metadata_fields}",
    7475    'type' => "string",
    7576    'deft' => "Title" },
    7677      { 'name' => "hunt_creator_metadata",
    77     'desc' => "{HTMLPlug.hunt_creator_metadata}",
     78    'desc' => "{HTMLPlugin.hunt_creator_metadata}",
    7879    'type' => "flag" },
    7980      { 'name' => "file_is_url",
    80     'desc' => "{HTMLPlug.file_is_url}",
     81    'desc' => "{HTMLPlugin.file_is_url}",
    8182    'type' => "flag" },
    8283      { 'name' => "assoc_files",
    83     'desc' => "{HTMLPlug.assoc_files}",
     84    'desc' => "{HTMLPlugin.assoc_files}",
    8485    'type' => "regexp",
    8586    'deft' => &get_default_block_exp() },
    8687      { 'name' => "rename_assoc_files",
    87     'desc' => "{HTMLPlug.rename_assoc_files}",
     88    'desc' => "{HTMLPlugin.rename_assoc_files}",
    8889    'type' => "flag" },
    8990      { 'name' => "title_sub",
    90     'desc' => "{HTMLPlug.title_sub}",
     91    'desc' => "{HTMLPlugin.title_sub}",
    9192    'type' => "string",
    9293    'deft' => "" },
    9394      { 'name' => "description_tags",
    94     'desc' => "{HTMLPlug.description_tags}",
     95    'desc' => "{HTMLPlugin.description_tags}",
    9596    'type' => "flag" },
    9697      # retain this for backward compatibility (w3mir option was replaced by
    9798      # file_is_url)
    9899      { 'name' => "w3mir",
    99 #   'desc' => "{HTMLPlug.w3mir}",
     100#   'desc' => "{HTMLPlugin.w3mir}",
    100101    'type' => "flag",
    101102    'hiddengli' => "yes"},
    102103      { 'name' => "no_strip_metadata_html",
    103     'desc' => "{HTMLPlug.no_strip_metadata_html}",
     104    'desc' => "{HTMLPlugin.no_strip_metadata_html}",
    104105    'type' => "string",
    105106    'deft' => "",
    106107    'reqd' => "no"},
    107108      { 'name' => "sectionalise_using_h_tags",
    108     'desc' => "{HTMLPlug.sectionalise_using_h_tags}",
     109    'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
    109110    'type' => "flag" },
    110111      { 'name' => "use_realistic_book",
    111         'desc' => "{HTMLPlug.tidy_html}",
     112        'desc' => "{HTMLPlugin.tidy_html}",
    112113    'type' => "flag"},
    113       { 'name' => "is_old_HDL_tags",
    114         'desc' => "{HTMLPlug.old_style_HDL}",
    115     'type' => "flag"},
    116       { 'name' => "no_image_links",            # in future think about removing this option,
    117         'desc' => "{HTMLPlug.no_image_links}", # since it has become the default behaviour
    118     'type' => "flag"}, 
     114      { 'name' => "old_style_HDL",
     115        'desc' => "{HTMLPlugin.old_style_HDL}",
     116    'type' => "flag"}
    119117      ];
    120118
    121 my $options = { 'name'     => "HTMLPlug",
    122         'desc'     => "{HTMLPlug.desc}",
     119my $options = { 'name'     => "HTMLPlugin",
     120        'desc'     => "{HTMLPlugin.desc}",
    123121        'abstract' => "no",
    124122        'inherits' => "yes",
     
    506504    if (($self->{'tidy_html'}) || ($self->{'old_style_HDL'}))
    507505    {
    508         # because the document has to be sectionalized set the description tags
    509         $self->{'description_tags'} = 1;
    510 
    511         # set the file to be tidied
    512             $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/;
    513        
    514             # get the tidied file
    515             #my $tidy_filename = $self->tmp_tidy_file($input_filename);
    516         my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
    517        
    518             # derive tmp filename from input filename
    519             my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
     506    # because the document has to be sectionalized set the description tags
     507    $self->{'description_tags'} = 1;
    520508   
    521         # set the new input file and base_dir to be from the tidied file
    522         $file = "$tailname$suffix";
    523         $base_dir = $dirname;
     509    # set the file to be tidied
     510    $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/;
     511   
     512    # get the tidied file
     513    #my $tidy_filename = $self->tmp_tidy_file($input_filename);
     514    my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
     515   
     516    # derive tmp filename from input filename
     517    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
     518   
     519    # set the new input file and base_dir to be from the tidied file
     520    $file = "$tailname$suffix";
     521    $base_dir = $dirname;
    524522    }
    525523   
    526524    # call the parent read_into_doc_obj
    527     my ($process_status,$doc_obj) = &BasPlug::read_into_doc_obj($self,$pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli);
     525    my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli);
    528526   
    529527    return ($process_status,$doc_obj);
     
    535533    push(@$pluginlist, $class);
    536534   
    537     if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
    538     if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
     535    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
     536    push(@{$hashArgOptLists->{"OptList"}},$options);
    539537   
    540538
    541     my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
     539    my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
    542540   
    543541    if ($self->{'w3mir'}) {
     
    618616    my $outhandle = $self->{'outhandle'};
    619617
    620     print STDERR "<Processing n='$file' p='HTMLPlug'>\n" if ($gli);
    621 
    622     print $outhandle "HTMLPlug: processing $file\n"
     618    print STDERR "<Processing n='$file' p='HTMLPlugin'>\n" if ($gli);
     619
     620    print $outhandle "HTMLPlugin: processing $file\n"
    623621    if $self->{'verbosity'} > 1;
    624622
     
    669667    # URL metadata (even invalid ones) are used to support internal
    670668    # links, so even if 'file_is_url' is off, still need to store info
    671    
    672     $file = &BasPlug::filename_to_metadata($self, $file); # ensures filename is in UTF8 character encoding
    673     my $web_url = "http://$file";
    674     $doc_obj->add_utf8_metadata($cursection, "URL", $web_url); # will eventually ensure it is utf8 anyway
     669
     670    my $utf8_file = $self->filename_to_utf8_metadata($file);
     671    my $web_url = "http://$utf8_file";
     672    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
    675673
    676674    if ($self->{'file_is_url'}) {
     
    752750    }
    753751    if ($cursection ne "") {
    754         print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
     752        print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
    755753    }
    756754
     
    760758        if (!$found_something) {
    761759        if ($self->{'verbosity'} > 2) {
    762             print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
     760            print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
    763761            print $outhandle "          will be processed as a single section document\n";
    764762        }
     
    775773
    776774        } else {
    777         print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
     775        print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
    778776        print $outhandle "          of the final closing </Section> tag. This text will\n";
    779777        print $outhandle "          be ignored.";
     
    795793        # been processed already but we should print the warning
    796794        # as above and extract metadata
    797         print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
     795        print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
    798796        print $outhandle "          is blank or empty.  Metadata will be assigned if present.\n";
    799797        }
     
    892890    # trap images
    893891
    894     # Previously, by default, HTMLPlug would embed <img> tags inside anchor tags
     892    # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
    895893    # i.e. <a href="image><img src="image"></a> in order to overcome a problem that
    896894    # turned regular text succeeding images into links. That is, by embedding <imgs>
     
    907905
    908906    # If at any time, there is a need for having images embedded in <a> anchor tags,
    909     # then it might be better to turn that into an HTMLPlug option rather than make
     907    # then it might be better to turn that into an HTMLPlugin option rather than make
    910908    # it the default behaviour. Also, eventually, no_image_links needs to become
    911     # a deprecated option for HTMLPlug as it has now become the default behaviour.
     909    # a deprecated option for HTMLPlugin as it has now become the default behaviour.
    912910
    913911    #if(!$self->{'no_image_links'}){
    914912    $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)([\"][^\"]+[\"]|[\'][^\']+[\']|[^\s\/>]+)([^>]*>)/
    915         $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
     913    $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    916914    #}
    917915
     
    936934    $back="\"$back";
    937935    }
     936
    938937    $link =~ s/\n/ /g;
    939938
     
    10741073
    10751074    my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;
    1076 
     1075   
    10771076    $hash_part = "" if !defined $hash_part;
    10781077    if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
    10791078    my $outhandle = $self->{'outhandle'};
    1080     print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
     1079    print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
    10811080        if $self->{'verbosity'};
    10821081    return ($link, "", 0);
     
    12571256
    12581257    if (!defined $tag) {
    1259         print $outhandle "HTMLPlug: can't find NAME in \"$metatag\"\n";
     1258        print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
    12601259        next;
    12611260    }
     
    12741273    }
    12751274    if (!defined $value) {
    1276         print $outhandle "HTMLPlug: can't find VALUE in \"$metatag\"\n";
     1275        print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
    12771276        next;
    12781277    }
     
    14251424
    14261425
    1427 # Extend the BasPlug read_file so that strings like &eacute; are
     1426# Extend read_file so that strings like &eacute; are
    14281427# converted to UTF8 internally. 
    14291428#
     
    14321431
    14331432sub read_file {
    1434     my ($self, $filename, $encoding, $language, $textref) = @_;
    1435 
    1436     &BasPlug::read_file($self, $filename, $encoding, $language, $textref);
     1433    my $self = shift(@_);
     1434    my ($filename, $encoding, $language, $textref) = @_;
     1435
     1436    $self->SUPER::read_file($filename, $encoding, $language, $textref);
    14371437
    14381438    # Convert entities to their UTF8 equivalents
Note: See TracChangeset for help on using the changeset viewer.