- Timestamp:
- 2000-07-13T10:21:53+12:00 (24 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/New_Config_Format-branch/gsdl/perllib/plugins/HTMLPlug.pm
r1020 r1279 50 50 51 51 sub print_usage { 52 print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";53 54 52 print STDERR "\n usage: plugin HTMLPlug [options]\n\n"; 55 53 print STDERR " options:\n"; 56 print STDERR " -process_exp A perl regular expression to match against filenames.\n";57 print STDERR " Matching filenames will be processed by this plugin.\n";58 print STDERR " Defaults to '(?i)\.html?\$' i.e. all documents ending in\n";59 print STDERR " .htm or .html (case-insensitive).\n";60 54 print STDERR " -nolinks Don't make any attempt to trap links (setting this flag may\n"; 61 55 print STDERR " improve speed of building/importing but any relative links within\n"; 62 56 print STDERR " documents will be broken).\n"; 63 print STDERR " -block_exp Files matching this regular expression will be blocked from\n";64 print STDERR " being passed to any further plugins in the list. By default\n";65 print STDERR " HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png, .pdf,\n";66 print STDERR " .rtf or .css file extensions.\n";67 57 print STDERR " -keep_head Don't remove headers from html files.\n"; 68 58 print STDERR " -no_metadata Don't attempt to extract any metadata from files.\n"; 69 59 print STDERR " -metadata_fields Comma separated list of metadata fields to attempt to extract.\n"; 70 print STDERR " Defaults to 'Title'\n"; 60 print STDERR " Defaults to 'Title'.\n"; 61 print STDERR " Use `first200` to get the first 200 characters of the body.\n"; 62 print STDERR " Use `H1` to get the text inside the first <H1> and </H1> tags in the text.\n"; 71 63 print STDERR " -w3mir Set if w3mir was used to generate input file structure.\n"; 72 print STDERR " w3mir \n";73 64 print STDERR " -assoc_files Perl regular expression of file extensions to associate with\n"; 74 print STDERR " html documents. Defaults to '(?i)\.(jpe?g|gif|png|css|pdf) $'\n";65 print STDERR " html documents. Defaults to '(?i)\.(jpe?g|gif|png|css|pdf)\$'\n"; 75 66 print STDERR " -rename_assoc_files Renames files associated with documents (e.g. images). Also\n"; 76 67 print STDERR " creates much shallower directory structure (useful when creating\n"; … … 80 71 sub new { 81 72 my $class = shift (@_); 82 my $self = new BasPlug ( );73 my $self = new BasPlug ("HTMLPlug", @_); 83 74 84 75 if (!parsargv::parse(\@_, 85 q^process_exp/.*/(?i)\.html?$^, \$self->{'process_exp'},86 76 q^nolinks^, \$self->{'nolinks'}, 87 q^block_exp/.*/(?i)\.(gif|jpe?g|png|pdf|rtf|css)$^, \$self->{'block_exp'},88 77 q^keep_head^, \$self->{'keep_head'}, 89 78 q^no_metadata^, \$self->{'no_metadata'}, … … 91 80 q^w3mir^, \$self->{'w3mir'}, 92 81 q^assoc_files/.*/(?i)\.(jpe?g|gif|png|css|pdf)$^, \$self->{'assoc_files'}, 93 q^rename_assoc_files^, \$self->{'rename_assoc_files'})) { 82 q^rename_assoc_files^, \$self->{'rename_assoc_files'}, 83 "allow_extra_options")) { 84 85 print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n"; 94 86 &print_usage(); 95 87 die "\n"; 96 88 } 97 89 98 90 $self->{'aux_files'} = {}; 99 91 $self->{'dir_num'} = 0; 100 92 $self->{'file_num'} = 0; 101 93 102 94 return bless $self, $class; 103 95 } 104 96 105 sub is_recursive { 106 my $self = shift (@_); 107 108 return 0; # this is not a recursive plugin 109 } 110 111 # return number of files processed, undef if can't process 112 # Note that $base_dir might be "" and that $file might 113 # include directories 114 sub read { 115 my $self = shift (@_); 116 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_; 117 118 my $filename = &util::filename_cat($base_dir, $file); 119 return 0 if $filename =~ /$self->{'block_exp'}/; 120 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) { 121 return undef; 122 } 123 $file =~ s/^[\/\\]+//; 124 125 $self->{'verbosity'} = $processor->{'verbosity'}; 97 98 sub get_default_block_exp { 99 my $self = shift (@_); 100 101 return q^(?i)\.(gif|jpe?g|png|pdf|rtf|css)$^; 102 } 103 104 sub get_default_process_exp { 105 my $self = shift (@_); 106 107 return q^(?i)\.html?$^; 108 } 109 110 111 # do plugin specific processing of doc_obj 112 sub process { 113 my $self = shift (@_); 114 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_; 115 126 116 print STDERR "HTMLPlug: processing $file\n" 127 117 if $self->{'verbosity'} > 1; 128 118 129 # create a new document130 my $doc_obj = new doc ($file, "indexed_doc");131 119 my $cursection = $doc_obj->get_top_section(); 132 133 # read in HTML file 134 open (FILE, $filename) || die "HTMLPlug::read - can't open $filename\n"; 135 undef $/; 136 my $text = <FILE>; 137 $/ = "\n"; 138 close FILE; 139 if (!defined $text || $text !~ /\w/) { 140 print STDERR "HTMLPlug: ERROR: $file contains no text\n" if $self->{'verbosity'}; 141 return 0; 142 } 143 144 $self->extra_metadata ($doc_obj, $cursection, $metadata); 145 $self->extract_metadata (\$text, $metadata, $doc_obj, $cursection) 120 121 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection) 146 122 unless $self->{'no_metadata'}; 147 123 … … 152 128 my $web_url = "http://$file"; 153 129 $web_url =~ s/\\/\//g; # for windows 154 $doc_obj->add_ metadata($cursection, "URL", $web_url);130 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url); 155 131 156 132 # remove header and footer 157 133 if (!$self->{'keep_head'}) { 158 $ text=~ s/^.*?<body[^>]*>//is;159 $ text=~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;134 $$textref =~ s/^.*?<body[^>]*>//is; 135 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg; 160 136 } 161 137 … … 164 140 165 141 # usemap="./#index" not handled correctly => change to "#index" 166 $ text=~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/142 $$textref =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/ 167 143 $self->replace_usemap_links($1, $2, $3)/isge; 168 144 169 $ text =~ s/(<(?:a|area|frame|link)\s+[^>]*?(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/145 $$textref =~ s/(<(?:a|area|frame|link)\s+[^>]*? (?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/ 170 146 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge; 171 147 } 172 148 173 149 # trap images 174 $ text =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/150 $$textref =~ s/(<img[^>]*? src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/ 175 151 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge; 176 152 177 $doc_obj->add_text ($cursection, $text); 178 179 # add an OID 180 $doc_obj->set_OID(); 181 182 # process the document 183 $processor->process($doc_obj); 184 185 return 1; # processed the file 153 # add text to document object 154 $doc_obj->add_utf8_text($cursection, "<pre>\n$$textref\n</pre>"); 155 156 return 1; 186 157 } 187 158 … … 349 320 350 321 foreach my $field (split /,/, $self->{'metadata_fields'}) { 351 322 352 323 # don't need to extract field if it was passed in from a previous 353 324 # (recursive) plugin … … 361 332 my $value = $1; 362 333 $value =~ s/\s+/ /gs; 363 $doc_obj->add_ metadata($section, $field, $value);334 $doc_obj->add_utf8_metadata($section, $field, $value); 364 335 next; 365 336 } … … 367 338 } 368 339 369 # special case for Title metadata - try <title> tags 370 # then first 100 characters of text 340 # TITLE: extract the document title 371 341 372 342 if ($field =~ /^title$/i) { … … 378 348 if ($title =~ /\w/) { 379 349 $title =~ s/\s+/ /gs; 380 $doc_obj->add_metadata ($section, $field, $title); 350 $title =~ s/^\s+//; 351 $title =~ s/\s+$//; 352 $doc_obj->add_utf8_metadata ($section, $field, $title); 381 353 next; 382 354 } … … 386 358 # if no title use first 100 characters 387 359 my $tmptext = $$textref; 360 $tmptext =~ s/\s+/ /gs; 388 361 $tmptext =~ s/<[^>]*>//g; 389 my $title = substr ($tmptext, 0, 100); 390 $title =~ s/\s+/ /gs; 391 $doc_obj->add_metadata ($section, $field, $title); 392 } 393 } 394 } 362 $tmptext = substr ($tmptext, 0, 100); 363 $tmptext =~ s/^\s+//; 364 $tmptext =~ s/\s+$//; 365 $tmptext =~ s/\s\S*$/.../; 366 $doc_obj->add_utf8_metadata ($section, $field, $tmptext); 367 next; 368 } 369 370 # FIRST200: extract the first 200 characters as metadata 371 372 if ($field =~ /^first200$/i) { 373 my $tmptext = $$textref; 374 $tmptext =~ s/\s+/ /gs; 375 $tmptext =~ s/.*<body[^>]*>//i; 376 $tmptext =~ s/<[^>]*>//g; 377 $tmptext = substr ($tmptext, 0, 200); 378 $tmptext =~ s/^\s+//; 379 $tmptext =~ s/\s+$//; 380 $tmptext =~ s/\s\S*$/.../; 381 $doc_obj->add_utf8_metadata ($section, $field, $tmptext); 382 next; 383 } 384 385 # H1: extract the text between the first <H1> and </H1> tags 386 if ($field =~ /^H1$/i) { 387 my $tmptext = $$textref; 388 $tmptext =~ s/\s+/ /gs; 389 if ($tmptext =~ /<H1[^>]*>/i) { 390 $tmptext =~ s/.*<H1[^>]*>//i; 391 $tmptext =~ s/<\/H1[^>]*>.*//i; 392 $tmptext =~ s/^\s+//; 393 $tmptext =~ s/\s+$//; 394 $doc_obj->add_utf8_metadata ($section, $field, $tmptext); 395 } 396 next; 397 } 398 } 399 } 400 395 401 396 402 # evaluate any "../" to next directory up
Note:
See TracChangeset
for help on using the changeset viewer.