Ignore:
Timestamp:
2000-07-13T10:21:53+12:00 (24 years ago)
Author:
sjboddie
Message:

merged changes to trunk into New_Config_Format branch

File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/New_Config_Format-branch/gsdl/perllib/plugins/HTMLPlug.pm

    r1020 r1279  
    5050
    5151sub print_usage {
    52     print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
    53 
    5452    print STDERR "\n  usage: plugin HTMLPlug [options]\n\n";
    5553    print STDERR "  options:\n";
    56     print STDERR "   -process_exp           A perl regular expression to match against filenames.\n";
    57     print STDERR "                          Matching filenames will be processed by this plugin.\n";
    58     print STDERR "                          Defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
    59     print STDERR "                          .htm or .html (case-insensitive).\n";
    6054    print STDERR "   -nolinks               Don't make any attempt to trap links (setting this flag may\n";
    6155    print STDERR "                          improve speed of building/importing but any relative links within\n";
    6256    print STDERR "                          documents will be broken).\n";
    63     print STDERR "   -block_exp             Files matching this regular expression will be blocked from\n";
    64     print STDERR "                          being passed to any further plugins in the list. By default\n";
    65     print STDERR "                          HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png, .pdf,\n";
    66     print STDERR "                          .rtf or .css file extensions.\n";
    6757    print STDERR "   -keep_head             Don't remove headers from html files.\n";
    6858    print STDERR "   -no_metadata           Don't attempt to extract any metadata from files.\n";
    6959    print STDERR "   -metadata_fields       Comma separated list of metadata fields to attempt to extract.\n";
    70     print STDERR "                          Defaults to 'Title'\n";
     60    print STDERR "                          Defaults to 'Title'.\n";
     61    print STDERR "                          Use `first200` to get the first 200 characters of the body.\n";
     62    print STDERR "                          Use `H1` to get the text inside the first <H1> and </H1> tags in the text.\n";
    7163    print STDERR "   -w3mir                 Set if w3mir was used to generate input file structure.\n";
    72     print STDERR "                          w3mir \n";
    7364    print STDERR "   -assoc_files           Perl regular expression of file extensions to associate with\n";
    74     print STDERR "                          html documents. Defaults to '(?i)\.(jpe?g|gif|png|css|pdf)$'\n";
     65    print STDERR "                          html documents. Defaults to '(?i)\.(jpe?g|gif|png|css|pdf)\$'\n";
    7566    print STDERR "   -rename_assoc_files    Renames files associated with documents (e.g. images). Also\n";
    7667    print STDERR "                          creates much shallower directory structure (useful when creating\n";
     
    8071sub new {
    8172    my $class = shift (@_);
    82     my $self = new BasPlug ();
     73    my $self = new BasPlug ("HTMLPlug", @_);
    8374
    8475    if (!parsargv::parse(\@_,
    85              q^process_exp/.*/(?i)\.html?$^, \$self->{'process_exp'},
    8676             q^nolinks^, \$self->{'nolinks'},
    87              q^block_exp/.*/(?i)\.(gif|jpe?g|png|pdf|rtf|css)$^, \$self->{'block_exp'},
    8877             q^keep_head^, \$self->{'keep_head'},
    8978             q^no_metadata^, \$self->{'no_metadata'},
     
    9180             q^w3mir^, \$self->{'w3mir'},
    9281             q^assoc_files/.*/(?i)\.(jpe?g|gif|png|css|pdf)$^, \$self->{'assoc_files'},
    93              q^rename_assoc_files^, \$self->{'rename_assoc_files'})) {
     82             q^rename_assoc_files^, \$self->{'rename_assoc_files'},
     83             "allow_extra_options")) {
     84
     85    print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
    9486    &print_usage();
    9587    die "\n";
    9688    }
    97 
     89   
    9890    $self->{'aux_files'} = {};
    9991    $self->{'dir_num'} = 0;
    10092    $self->{'file_num'} = 0;
    101 
     93   
    10294    return bless $self, $class;
    10395}
    10496
    105 sub is_recursive {
    106     my $self = shift (@_);
    107 
    108     return 0; # this is not a recursive plugin
    109 }
    110 
    111 # return number of files processed, undef if can't process
    112 # Note that $base_dir might be "" and that $file might
    113 # include directories
    114 sub read {
    115     my $self = shift (@_);
    116     my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
    117 
    118     my $filename = &util::filename_cat($base_dir, $file);
    119     return 0 if $filename =~ /$self->{'block_exp'}/;
    120     if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
    121     return undef;
    122     }
    123     $file =~ s/^[\/\\]+//;
    124 
    125     $self->{'verbosity'} = $processor->{'verbosity'};
     97
     98sub get_default_block_exp {
     99    my $self = shift (@_);
     100
     101    return q^(?i)\.(gif|jpe?g|png|pdf|rtf|css)$^;
     102}
     103
     104sub get_default_process_exp {
     105    my $self = shift (@_);
     106
     107    return q^(?i)\.html?$^;
     108}
     109
     110
     111# do plugin specific processing of doc_obj
     112sub process {
     113    my $self = shift (@_);
     114    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
     115
    126116    print STDERR "HTMLPlug: processing $file\n"
    127117    if $self->{'verbosity'} > 1;
    128118
    129     # create a new document
    130     my $doc_obj = new doc ($file, "indexed_doc");
    131119    my $cursection = $doc_obj->get_top_section();
    132    
    133     # read in HTML file
    134     open (FILE, $filename) || die "HTMLPlug::read - can't open $filename\n";
    135     undef $/;
    136     my $text = <FILE>;
    137     $/ = "\n";
    138     close FILE;
    139     if (!defined $text || $text !~ /\w/) {
    140     print STDERR "HTMLPlug: ERROR: $file contains no text\n" if $self->{'verbosity'};
    141     return 0;
    142     }
    143 
    144     $self->extra_metadata ($doc_obj, $cursection, $metadata);
    145     $self->extract_metadata (\$text, $metadata, $doc_obj, $cursection)
     120
     121    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
    146122    unless $self->{'no_metadata'};
    147123
     
    152128    my $web_url = "http://$file";
    153129    $web_url =~ s/\\/\//g; # for windows
    154     $doc_obj->add_metadata($cursection, "URL", $web_url);
     130    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
    155131
    156132    # remove header and footer
    157133    if (!$self->{'keep_head'}) {
    158     $text =~ s/^.*?<body[^>]*>//is;
    159     $text =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
     134    $$textref =~ s/^.*?<body[^>]*>//is;
     135    $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
    160136    }
    161137
     
    164140
    165141    # usemap="./#index" not handled correctly => change to "#index"
    166     $text =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
     142    $$textref =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
    167143        $self->replace_usemap_links($1, $2, $3)/isge;
    168144
    169     $text =~ s/(<(?:a|area|frame|link)\s+[^>]*?(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
     145    $$textref =~ s/(<(?:a|area|frame|link)\s+[^>]*? (?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
    170146        $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    171147    }
    172148
    173149    # trap images
    174     $text =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
     150    $$textref =~ s/(<img[^>]*? src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
    175151    $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    176152
    177     $doc_obj->add_text ($cursection, $text);
    178 
    179     # add an OID
    180     $doc_obj->set_OID();
    181 
    182     # process the document
    183     $processor->process($doc_obj);
    184 
    185     return 1; # processed the file
     153    # add text to document object
     154    $doc_obj->add_utf8_text($cursection, "<pre>\n$$textref\n</pre>");
     155
     156    return 1;
    186157}
    187158
     
    349320
    350321    foreach my $field (split /,/, $self->{'metadata_fields'}) {
    351    
     322
    352323    # don't need to extract field if it was passed in from a previous
    353324    # (recursive) plugin
     
    361332            my $value = $1;
    362333            $value =~ s/\s+/ /gs;
    363             $doc_obj->add_metadata($section, $field, $value);
     334            $doc_obj->add_utf8_metadata($section, $field, $value);
    364335            next;
    365336        }
     
    367338    }
    368339   
    369     # special case for Title metadata - try <title> tags
    370     # then first 100 characters of text
     340    # TITLE: extract the document title
    371341   
    372342    if ($field =~ /^title$/i) {
     
    378348            if ($title =~ /\w/) {
    379349            $title =~ s/\s+/ /gs;
    380             $doc_obj->add_metadata ($section, $field, $title);
     350            $title =~ s/^\s+//;
     351            $title =~ s/\s+$//;
     352            $doc_obj->add_utf8_metadata ($section, $field, $title);
    381353            next;
    382354            }
     
    386358        # if no title use first 100 characters
    387359        my $tmptext = $$textref;
     360        $tmptext =~ s/\s+/ /gs;
    388361        $tmptext =~ s/<[^>]*>//g;
    389         my $title = substr ($tmptext, 0, 100);
    390         $title =~ s/\s+/ /gs;
    391         $doc_obj->add_metadata ($section, $field, $title);
    392     }
    393     }
    394 }
     362        $tmptext = substr ($tmptext, 0, 100);
     363        $tmptext =~ s/^\s+//;
     364        $tmptext =~ s/\s+$//;
     365        $tmptext =~ s/\s\S*$/.../;
     366        $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
     367        next;
     368    }
     369
     370    # FIRST200: extract the first 200 characters as metadata
     371
     372    if ($field =~ /^first200$/i) {
     373        my $tmptext = $$textref;
     374        $tmptext =~ s/\s+/ /gs;
     375        $tmptext =~ s/.*<body[^>]*>//i;
     376        $tmptext =~ s/<[^>]*>//g;
     377        $tmptext = substr ($tmptext, 0, 200);
     378        $tmptext =~ s/^\s+//;
     379        $tmptext =~ s/\s+$//;
     380        $tmptext =~ s/\s\S*$/.../;
     381        $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
     382        next;
     383    }
     384
     385    # H1: extract the text between the first <H1> and </H1> tags
     386    if ($field =~ /^H1$/i) {
     387        my $tmptext = $$textref;
     388        $tmptext =~ s/\s+/ /gs;
     389        if ($tmptext =~ /<H1[^>]*>/i) {
     390        $tmptext =~ s/.*<H1[^>]*>//i;
     391        $tmptext =~ s/<\/H1[^>]*>.*//i;
     392        $tmptext =~ s/^\s+//;
     393        $tmptext =~ s/\s+$//;
     394        $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
     395        }
     396        next;
     397    }
     398    }
     399}
     400
    395401
    396402# evaluate any "../" to next directory up
Note: See TracChangeset for help on using the changeset viewer.