Changeset 1243 for trunk/gsdl


Ignore:
Timestamp:
2000-06-27T09:38:51+12:00 (24 years ago)
Author:
sjboddie
Message:

Brought HTMLPlug up to date with BasPlug. Made a few minor changes to some
supporting files (to support the new BasPlug options).

Location:
trunk/gsdl
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/import.pl

    r1031 r1243  
    135135
    136136    # load all the plugins
    137     $pluginfo = &plugin::load_plugins ($plugins);
     137    $pluginfo = &plugin::load_plugins ($plugins, $verbosity);
    138138    if (scalar(@$pluginfo) == 0) {
    139139    print STDERR "No plugins were loaded.\n";
  • trunk/gsdl/perllib/mgbuilder.pm

    r1072 r1243  
    9999   
    100100    # load all the plugins
    101     $self->{'pluginfo'} = &plugin::load_plugins ($plugins);
     101    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity);
    102102    if (scalar(@{$self->{'pluginfo'}}) == 0) {
    103103    print STDERR "No plugins were loaded.\n";
  • trunk/gsdl/perllib/plugin.pm

    r835 r1243  
    2929
    3030sub load_plugins {
    31     my ($plugin_list) = @_;
     31    my ($plugin_list, $verbosity) = @_;
    3232    my @plugin_objects = ();
     33
     34    $verbosity = 2 unless defined $verbosity;
    3335
    3436    foreach $pluginoptions (@$plugin_list) {
     
    5254    die "$@" if $@;
    5355   
     56    # initialize plugin
     57    $plugobj->init($verbosity);
     58
    5459    # add this object to the list
    5560    push (@plugin_objects, $plugobj);
  • trunk/gsdl/perllib/plugins/HTMLPlug.pm

    r1231 r1243  
    5454    print STDERR "\n  usage: plugin HTMLPlug [options]\n\n";
    5555    print STDERR "  options:\n";
    56     print STDERR "   -process_exp           A perl regular expression to match against filenames.\n";
    57     print STDERR "                          Matching filenames will be processed by this plugin.\n";
    58     print STDERR "                          Defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
    59     print STDERR "                          .htm or .html (case-insensitive).\n";
    6056    print STDERR "   -nolinks               Don't make any attempt to trap links (setting this flag may\n";
    6157    print STDERR "                          improve speed of building/importing but any relative links within\n";
    6258    print STDERR "                          documents will be broken).\n";
    63     print STDERR "   -block_exp             Files matching this regular expression will be blocked from\n";
    64     print STDERR "                          being passed to any further plugins in the list. By default\n";
    65     print STDERR "                          HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png, .pdf,\n";
    66     print STDERR "                          .rtf or .css file extensions.\n";
    6759    print STDERR "   -keep_head             Don't remove headers from html files.\n";
    6860    print STDERR "   -no_metadata           Don't attempt to extract any metadata from files.\n";
     
    8274sub new {
    8375    my $class = shift (@_);
    84     my $self = new BasPlug (@_);
     76    my $self = new BasPlug ("HTMLPlug", @_);
    8577
    8678    if (!parsargv::parse(\@_,
    87              q^process_exp/.*/(?i)\.html?$^, \$self->{'process_exp'},
    8879             q^nolinks^, \$self->{'nolinks'},
    89              q^block_exp/.*/(?i)\.(gif|jpe?g|png|pdf|rtf|css)$^, \$self->{'block_exp'},
    9080             q^keep_head^, \$self->{'keep_head'},
    9181             q^no_metadata^, \$self->{'no_metadata'},
     
    9585             q^rename_assoc_files^, \$self->{'rename_assoc_files'},
    9686             "allow_extra_options")) {
    97 
     87   
    9888    &print_usage();
    9989    die "\n";
    10090    }
    101 
     91   
    10292    $self->{'aux_files'} = {};
    10393    $self->{'dir_num'} = 0;
    10494    $self->{'file_num'} = 0;
    105 
     95   
    10696    return bless $self, $class;
    10797}
    10898
    109 sub is_recursive {
    110     my $self = shift (@_);
    111 
    112     return 0; # this is not a recursive plugin
    113 }
    114 
    115 # return number of files processed, undef if can't process
    116 # Note that $base_dir might be "" and that $file might
    117 # include directories
    118 sub read {
    119     my $self = shift (@_);
    120     my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
    121 
    122     my $filename = &util::filename_cat($base_dir, $file);
    123     return 0 if $filename =~ /$self->{'block_exp'}/;
    124     if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
    125     return undef;
    126     }
    127     $file =~ s/^[\/\\]+//;
    128 
    129     $self->{'verbosity'} = $processor->{'verbosity'};
     99
     100sub get_default_block_exp {
     101    my $self = shift (@_);
     102
     103    return q^(?i)\.(gif|jpe?g|png|pdf|rtf|css)$^;
     104}
     105
     106sub get_default_process_exp {
     107    my $self = shift (@_);
     108
     109    return q^(?i)\.html?$^;
     110}
     111
     112
     113# do plugin specific processing of doc_obj
     114sub process {
     115    my $self = shift (@_);
     116    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
     117
    130118    print STDERR "HTMLPlug: processing $file\n"
    131119    if $self->{'verbosity'} > 1;
    132120
    133     # create a new document
    134     my $doc_obj = new doc ($file, "indexed_doc");
    135121    my $cursection = $doc_obj->get_top_section();
    136    
    137     # read in HTML file ($text will be in utf8)
    138     my $text = "";
    139     $self->read_file ($filename, \$text);
    140 
    141     if ($text !~ /\w/) {
    142     print STDERR "HTMLPlug: ERROR: $file contains no text\n" if $self->{'verbosity'};
    143     return 0;
    144     }
    145 
    146     $self->extra_metadata ($doc_obj, $cursection, $metadata);
    147     $self->extract_metadata (\$text, $metadata, $doc_obj, $cursection)
     122
     123    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
    148124    unless $self->{'no_metadata'};
    149125
     
    158134    # remove header and footer
    159135    if (!$self->{'keep_head'}) {
    160     $text =~ s/^.*?<body[^>]*>//is;
    161     $text =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
     136    $$textref =~ s/^.*?<body[^>]*>//is;
     137    $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
    162138    }
    163139
     
    166142
    167143    # usemap="./#index" not handled correctly => change to "#index"
    168     $text =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
     144    $$textref =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
    169145        $self->replace_usemap_links($1, $2, $3)/isge;
    170146
    171     $text =~ s/(<(?:a|area|frame|link)\s+[^>]*?(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
     147    $$textref =~ s/(<(?:a|area|frame|link)\s+[^>]*?(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
    172148        $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    173149    }
    174150
    175151    # trap images
    176     $text =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
     152    $$textref =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
    177153    $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    178 
    179     $doc_obj->add_utf8_text ($cursection, $text);
    180 
    181     # add an OID
    182     $doc_obj->set_OID();
    183 
    184     # process the document
    185     $processor->process($doc_obj);
    186 
    187     return 1; # processed the file
    188154}
    189155
Note: See TracChangeset for help on using the changeset viewer.