Changeset 1243 for trunk/gsdl
- Timestamp:
- 2000-06-27T09:38:51+12:00 (24 years ago)
- Location:
- trunk/gsdl
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/bin/script/import.pl
r1031 r1243 135 135 136 136 # load all the plugins 137 $pluginfo = &plugin::load_plugins ($plugins );137 $pluginfo = &plugin::load_plugins ($plugins, $verbosity); 138 138 if (scalar(@$pluginfo) == 0) { 139 139 print STDERR "No plugins were loaded.\n"; -
trunk/gsdl/perllib/mgbuilder.pm
r1072 r1243 99 99 100 100 # load all the plugins 101 $self->{'pluginfo'} = &plugin::load_plugins ($plugins );101 $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity); 102 102 if (scalar(@{$self->{'pluginfo'}}) == 0) { 103 103 print STDERR "No plugins were loaded.\n"; -
trunk/gsdl/perllib/plugin.pm
r835 r1243 29 29 30 30 sub load_plugins { 31 my ($plugin_list ) = @_;31 my ($plugin_list, $verbosity) = @_; 32 32 my @plugin_objects = (); 33 34 $verbosity = 2 unless defined $verbosity; 33 35 34 36 foreach $pluginoptions (@$plugin_list) { … … 52 54 die "$@" if $@; 53 55 56 # initialize plugin 57 $plugobj->init($verbosity); 58 54 59 # add this object to the list 55 60 push (@plugin_objects, $plugobj); -
trunk/gsdl/perllib/plugins/HTMLPlug.pm
r1231 r1243 54 54 print STDERR "\n usage: plugin HTMLPlug [options]\n\n"; 55 55 print STDERR " options:\n"; 56 print STDERR " -process_exp A perl regular expression to match against filenames.\n";57 print STDERR " Matching filenames will be processed by this plugin.\n";58 print STDERR " Defaults to '(?i)\.html?\$' i.e. all documents ending in\n";59 print STDERR " .htm or .html (case-insensitive).\n";60 56 print STDERR " -nolinks Don't make any attempt to trap links (setting this flag may\n"; 61 57 print STDERR " improve speed of building/importing but any relative links within\n"; 62 58 print STDERR " documents will be broken).\n"; 63 print STDERR " -block_exp Files matching this regular expression will be blocked from\n";64 print STDERR " being passed to any further plugins in the list. By default\n";65 print STDERR " HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png, .pdf,\n";66 print STDERR " .rtf or .css file extensions.\n";67 59 print STDERR " -keep_head Don't remove headers from html files.\n"; 68 60 print STDERR " -no_metadata Don't attempt to extract any metadata from files.\n"; … … 82 74 sub new { 83 75 my $class = shift (@_); 84 my $self = new BasPlug ( @_);76 my $self = new BasPlug ("HTMLPlug", @_); 85 77 86 78 if (!parsargv::parse(\@_, 87 q^process_exp/.*/(?i)\.html?$^, \$self->{'process_exp'},88 79 q^nolinks^, \$self->{'nolinks'}, 89 q^block_exp/.*/(?i)\.(gif|jpe?g|png|pdf|rtf|css)$^, \$self->{'block_exp'},90 80 q^keep_head^, \$self->{'keep_head'}, 91 81 q^no_metadata^, \$self->{'no_metadata'}, … … 95 85 q^rename_assoc_files^, \$self->{'rename_assoc_files'}, 96 86 "allow_extra_options")) { 97 87 98 88 &print_usage(); 99 89 die "\n"; 100 90 } 101 91 102 92 $self->{'aux_files'} = {}; 103 93 $self->{'dir_num'} = 0; 104 94 $self->{'file_num'} = 0; 105 95 106 96 return bless $self, $class; 107 97 } 108 98 109 sub is_recursive { 110 my $self = shift (@_); 111 112 return 0; # this is not a recursive plugin 113 } 114 115 # return number of files 
processed, undef if can't process 116 # Note that $base_dir might be "" and that $file might 117 # include directories 118 sub read { 119 my $self = shift (@_); 120 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_; 121 122 my $filename = &util::filename_cat($base_dir, $file); 123 return 0 if $filename =~ /$self->{'block_exp'}/; 124 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) { 125 return undef; 126 } 127 $file =~ s/^[\/\\]+//; 128 129 $self->{'verbosity'} = $processor->{'verbosity'}; 99 100 sub get_default_block_exp { 101 my $self = shift (@_); 102 103 return q^(?i)\.(gif|jpe?g|png|pdf|rtf|css)$^; 104 } 105 106 sub get_default_process_exp { 107 my $self = shift (@_); 108 109 return q^(?i)\.html?$^; 110 } 111 112 113 # do plugin specific processing of doc_obj 114 sub process { 115 my $self = shift (@_); 116 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_; 117 130 118 print STDERR "HTMLPlug: processing $file\n" 131 119 if $self->{'verbosity'} > 1; 132 120 133 # create a new document134 my $doc_obj = new doc ($file, "indexed_doc");135 121 my $cursection = $doc_obj->get_top_section(); 136 137 # read in HTML file ($text will be in utf8) 138 my $text = ""; 139 $self->read_file ($filename, \$text); 140 141 if ($text !~ /\w/) { 142 print STDERR "HTMLPlug: ERROR: $file contains no text\n" if $self->{'verbosity'}; 143 return 0; 144 } 145 146 $self->extra_metadata ($doc_obj, $cursection, $metadata); 147 $self->extract_metadata (\$text, $metadata, $doc_obj, $cursection) 122 123 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection) 148 124 unless $self->{'no_metadata'}; 149 125 … … 158 134 # remove header and footer 159 135 if (!$self->{'keep_head'}) { 160 $ text=~ s/^.*?<body[^>]*>//is;161 $ text=~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;136 $$textref =~ s/^.*?<body[^>]*>//is; 137 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg; 162 138 } 163 139 … … 166 142 167 143 # usemap="./#index" not handled correctly => 
change to "#index" 168 $ text=~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/144 $$textref =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/ 169 145 $self->replace_usemap_links($1, $2, $3)/isge; 170 146 171 $ text=~ s/(<(?:a|area|frame|link)\s+[^>]*?(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/147 $$textref =~ s/(<(?:a|area|frame|link)\s+[^>]*?(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/ 172 148 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge; 173 149 } 174 150 175 151 # trap images 176 $ text=~ s/(<img[^>]*?src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/152 $$textref =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/ 177 153 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge; 178 179 $doc_obj->add_utf8_text ($cursection, $text);180 181 # add an OID182 $doc_obj->set_OID();183 184 # process the document185 $processor->process($doc_obj);186 187 return 1; # processed the file188 154 } 189 155
Note: See TracChangeset for help on using the changeset viewer.