Changeset 1244
- Timestamp:
- 2000-06-27T17:10:07+12:00 (24 years ago)
- Location:
- trunk/gsdl
- Files:
-
- 1 added
- 12 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/acronym.pm
r1242 r1244 27 27 28 28 use strict; 29 use diagnostics;29 #use diagnostics; 30 30 31 31 package acronym; -
trunk/gsdl/perllib/plugin.pm
r1243 r1244 51 51 map { $_ = "\"$_\""; } @$pluginoptions; 52 52 my $options = join (",", @$pluginoptions); 53 $options =~ s/\$/\\\$/g; 53 54 eval ("\$plugobj = new \$pluginname($options)"); 54 55 die "$@" if $@; -
trunk/gsdl/perllib/plugins/ArcPlug.pm
r809 r1244 39 39 } 40 40 41 use strict; 42 41 43 sub new { 42 44 my ($class) = @_; 43 my $self = new BasPlug ( );45 my $self = new BasPlug ("ArcPlug", @_); 44 46 45 47 return bless $self, $class; … … 58 60 sub read { 59 61 my $self = shift (@_); 60 ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;62 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_; 61 63 62 64 my $count = 0; 63 65 64 66 # see if this has a archives information file within it 65 $archive_info_filename = &util::filename_cat($base_dir,$file,"archives.inf");67 my $archive_info_filename = &util::filename_cat($base_dir,$file,"archives.inf"); 66 68 67 69 if (-e $archive_info_filename) { … … 77 79 78 80 # process each file 79 foreach $subfile (@$file_list) {81 foreach my $subfile (@$file_list) { 80 82 last if ($maxdocs != -1 && $count >= $maxdocs); 81 83 -
trunk/gsdl/perllib/plugins/BasPlug.pm
r1242 r1244 33 33 use doc; 34 34 35 sub print_ usage {35 sub print_general_usage { 36 36 my ($plugin_name) = @_; 37 37 38 print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";39 print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";40 41 38 print STDERR "\n usage: plugin $plugin_name [options]\n\n"; 42 print STDERR " currently supported general options are:\n";43 39 print STDERR " -input_encoding The encoding of the source documents. Documents will be\n"; 44 40 print STDERR " converted from these encodings and stored internally as\n"; … … 71 67 } 72 68 69 # print_usage should be overridden for any sub-classes having 70 # their own plugin specific options 71 sub print_usage { 72 print STDERR "\nThis plugin has no plugin specific options\n\n"; 73 74 } 75 73 76 sub new { 74 77 my $class = shift (@_); … … 85 88 q^extract_acronyms^, \$self->{'extract_acronyms'}, 86 89 "allow_extra_options")) { 87 &print_usage($plugin_name); 90 91 print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n"; 92 print STDERR "available to all plugins). Check your collect.cfg configuration file.\n"; 93 &print_general_usage($plugin_name); 88 94 die "\n"; 89 95 } … … 103 109 # set process_exp and block_exp to defaults unless they were 104 110 # explicitly set 105 if ((!$self->is_recursive()) && 111 112 if ((!$self->is_recursive()) and 106 113 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) { 107 114 108 115 $self->{'process_exp'} = $self->get_default_process_exp (); 109 116 if ($self->{'process_exp'} eq "") { 110 warn ref($self) . " Warning: Non-recursive plugin has no process_exp so will have no effect\n";117 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n"; 111 118 } 112 119 } … … 115 122 $self->{'block_exp'} = $self->get_default_block_exp (); 116 123 } 124 125 # handle input_encoding aliases 126 $self->{'input_encoding'} = "iso_8859_1" if $self->{'input_encoding'} eq "Latin1"; 127 $self->{'input_encoding'} = "windows_1256" if $self->{'input_encoding'} eq "Arabic"; 117 128 } 118 129 … … 152 163 # process() function and let this read() function keep control. 153 164 # 165 # recursive plugins (e.g. RecPlug) and specialized plugins like those 166 # capable of processing many documents within a single file (e.g. 167 # GMLPlug) should normally implement their own version of read() 168 # 154 169 # Return number of files processed, undef if can't process 155 170 # Note that $base_dir might be "" and that $file might … … 165 180 166 181 my $filename = &util::filename_cat($base_dir, $file); 167 return 0 if $ filename =~ /$self->{'block_exp'}/;182 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/; 168 183 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) { 169 184 return undef; … … 174 189 # create a new document 175 190 my $doc_obj = new doc ($file, "indexed_doc"); 176 my $cursection =177 191 178 192 # read in file ($text will be in utf8) … … 190 204 191 205 # do plugin specific processing of doc_obj 192 $self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj); 193 194 # add text 195 $doc_obj->add_utf8_text ($cursection, $text); 206 return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj)); 196 207 197 208 # do any automatic metadata extraction … … 207 218 } 208 219 220 # returns undef if file is rejected by the plugin 209 221 sub process { 210 222 my $self = shift (@_); … … 212 224 213 225 die "Basplug::process function must be implemented in sub-class\n"; 226 227 return undef; # never gets here 214 228 } 215 229 … … 223 237 224 238 $$textref = ""; 225 my $encoding = "";226 if ($self->{'input_encoding'} =~ /^(Latin1|iso_8859_1)$/) {227 $encoding = "iso_8859_1";228 } elsif ($self->{'input_encoding'} =~ /^(Arabic|windows_1256)$/) {229 $encoding = "windows_1256";230 } else {231 $encoding = $self->{'input_encoding'};232 }233 239 234 240 open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n"; 235 241 236 if ($ encodingeq "ascii") {242 if ($self->{'input_encoding'} eq "ascii") { 237 243 undef $/; 238 244 $$textref = <FILE>; … … 241 247 my $reader = new multiread(); 242 248 $reader->set_handle ('BasPlug::FILE'); 243 $reader->set_encoding ($ encoding);249 $reader->set_encoding ($self->{'input_encoding'}); 244 250 $reader->read_file ($textref); 245 251 246 if ($ encodingeq "gb") {252 if ($self->{'input_encoding'} eq "gb") { 247 253 # segment the Chinese words 248 254 $$textref = &cnseg::segment($$textref); -
trunk/gsdl/perllib/plugins/EMAILPlug.pm
r1206 r1244 70 70 } 71 71 72 use strict; 72 73 73 74 # Create a new EMAILPlug object with which to parse a file. … … 77 78 sub new { 78 79 my ($class) = @_; 79 $self = new BasPlug (); 80 my $self = new BasPlug ("EMAILPlug", @_); 81 80 82 return bless $self, $class; 81 83 } 82 84 83 84 # Is EMAILPlug recursive? No. 85 86 sub is_recursive { 87 return 0; 88 } 89 90 91 # Read a file and store its contents in a new document object. 92 # First, we check to see if it is an email message we're dealing 93 # with, then we extract the text and metadata, then we store 94 # all this information. 95 # 96 # Returns: number of files processed or undef if it can't process 97 # a file. This plugin only processes one file at a time. 98 99 sub read { 85 sub get_default_process_exp { 100 86 my $self = shift (@_); 101 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_; 102 103 # 104 # Check that we're dealig with a valid mail file 105 # 106 107 # Make sure file exists 108 my $filename = &util::filename_cat($base_dir, $file); 109 return undef unless (-e $filename); 110 return undef unless ($filename =~ /\d+(\.email)?$/); 111 112 # Read the text and make sure it is an email message 113 open (FILE, $filename) || die "EMAILPlug::read - can't open $filename\n"; 114 my @text = <FILE>; 115 my $text = join("", @text); 116 return undef unless (($text =~ /From:/) || ($text =~ /To:/)); 117 118 print STDERR "EMAILPlug: processing $filename\n" if $processor->{'verbosity'}; 87 88 return q^\d+(\.email)?$^; 89 } 90 91 # do plugin specific processing of doc_obj 92 sub process { 93 my $self = shift (@_); 94 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_; 95 96 # Check that we're dealing with a valid mail file 97 return undef unless (($$textref =~ /From:/) || ($$textref =~ /To:/)); 98 99 print STDERR "EMAILPlug: processing $file\n" 100 if $self->{'verbosity'} > 1; 101 102 my $cursection = $doc_obj->get_top_section(); 119 103 120 104 # … … 123 107 124 108 # Separate header from body of message 125 my $Headers = $ text;109 my $Headers = $$textref; 126 110 $Headers =~ s/\n\n.*//s; 127 $ text = substr $text, (length $Headers);111 $$textref = substr $$textref, (length $Headers); 128 112 129 113 # Extract basic metadata from header … … 158 142 159 143 160 # 161 # Create a new document object 162 # 163 164 my $doc_obj = new doc ($file, "indexed_doc"); 165 my $cursection = $doc_obj->get_top_section(); 166 167 # Add specilised metadata 144 # Add extracted metadata to document object 168 145 foreach my $name (keys %raw) { 169 146 $value = $raw{$name}; … … 173 150 $value = "No $name field"; 174 151 } 175 $doc_obj->add_ metadata ($cursection, $name, $value);152 $doc_obj->add_utf8_metadata ($cursection, $name, $value); 176 153 } 177 154 … … 179 156 $Headers = &text_into_html($Headers); 180 157 $Headers = "No headers" unless ($Headers =~ /\w/); 181 $doc_obj->add_metadata ($cursection, "Headers", $Headers); 182 183 # Add document text 184 $text = &text_into_html($text); 185 $text = "No message" unless ($text =~ /\w/); 186 $doc_obj->add_text ($cursection, $text); 187 188 # Add the OID - that is, the big HASH value used as a unique ID 189 $doc_obj->set_OID (); 190 191 # Process the document 192 $processor->process($doc_obj); 193 194 # Return the number of documents processed 195 return 1; 196 158 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers); 159 160 # Add text to document object 161 $$textref = &text_into_html($$textref); 162 $$textref = "No message" unless ($$textref =~ /\w/); 163 $doc_obj->add_utf8_text($cursection, $$textref); 164 165 return 1; 197 166 } 198 167 … … 213 182 my ($text) = @_; 214 183 215 # Convert problem chara ters into HTML symbols184 # Convert problem characters into HTML symbols 216 185 $text =~ s/&/&/go; 217 186 $text =~ s/</</go; … … 236 205 # Perl packages have to return true if they are run. 237 206 1; 238 239 240 241 242 243 244 -
trunk/gsdl/perllib/plugins/GMLPlug.pm
r1010 r1244 37 37 } 38 38 39 use strict; 40 39 41 sub new { 40 42 my ($class) = @_; 41 $self = new BasPlug ();43 my $self = new BasPlug ("GMLPlug", @_); 42 44 43 45 return bless $self, $class; 44 46 } 45 47 46 47 sub is_recursive { 48 sub get_default_process_exp { 48 49 my $self = shift (@_); 49 50 50 return 0; # this is not a recursive plugin 51 } 52 53 sub _unescape_text { 54 my ($text) = @_; 55 56 # special characters in the gml encoding 57 $text =~ s/</</g; 58 $text =~ s/>/>/g; 59 $text =~ s/"/\"/g; 60 $text =~ s/&/&/g; # this has to be last... 61 62 return $text; 51 return q^(?i)\.gml(\.gz)?$^; 63 52 } 64 53 … … 69 58 my $self = shift (@_); 70 59 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_; 71 my $fullname = &util::filename_cat ($base_dir, $file); 72 73 # see if this is a gml book 74 return undef unless (-f $fullname && $fullname =~ /\.gml(\.gz)?$/io); 75 76 my ($parent_dir, $gz) = $fullname =~ /^(.*?)[\/\\][^\/\\]+.gml(\.gz)?$/io; 77 78 if (defined $gz && $gz =~ /\.gz/io) { 60 61 my $filename = &util::filename_cat($base_dir, $file); 62 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/; 63 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) { 64 return undef; 65 } 66 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up 67 68 print STDERR "GMLPlug: processing $file\n"; 69 70 my $parent_dir = $file; 71 $parent_dir =~ s/[^\\\/]*$//; 72 $parent_dir = &util::filename_cat ($base_dir, $parent_dir); 73 74 # all this gzip stuff should one day be replaced by a gzip/bzip/zip/tar 75 # handling plugin 76 my $gz = 0; 77 if ($file =~ /\.gz$/i) { 79 78 $gz = 1; 80 } else {81 $gz = 0;82 79 } 83 80 84 print STDERR "GMLPlug: processing $file\n"; 85 86 # read in the document 81 # read in the document - input is assumed throughout this plugin to already be utf8 87 82 if ($gz) { 88 if (!open (INFILE, "zcat $f ullname |")) {89 print STDERR "GMLPlug::read - zcat couldn't read $f ullname\n";90 return undef;83 if (!open (INFILE, "zcat $filename |")) { 84 print STDERR "GMLPlug::read - zcat couldn't read $filename\n"; 85 return 0; 91 86 } 92 87 } else { 93 if (!open (INFILE, $f ullname)) {94 print STDERR "GMLPlug::read - couldn't read $f ullname\n";95 return undef;88 if (!open (INFILE, $filename)) { 89 print STDERR "GMLPlug::read - couldn't read $filename\n"; 90 return 0; 96 91 } 97 92 } … … 106 101 107 102 my $no_docs = 0; 108 # my $src_filename = ""; #### don't appear to use this anymore - not sure if that's right109 103 110 104 while (1) { … … 128 122 129 123 } else { 130 print STDERR "GMLPlug::read - error in file $f ullname\n";124 print STDERR "GMLPlug::read - error in file $filename\n"; 131 125 print STDERR "text: \"$gml\"\n"; 132 126 last; … … 166 160 last if $section eq ""; # back to top level again (more than one document in gml file) 167 161 $section = $doc_obj->get_parent_section ($section); 168 } # while (1) section level162 } # while (1) section level 169 163 170 164 # add the associated files 171 $assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile");165 my $assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile"); 172 166 my ($assoc_file_info, $afile); 173 167 foreach $assoc_file_info (@$assoc_files) { … … 186 180 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); 187 181 188 # assume the document has an OID 182 # do any automatic metadata extraction 183 $self->auto_extract_metadata ($doc_obj); 184 185 # assume the document has an OID already 189 186 190 187 # process the document … … 194 191 last if ($maxdocs > -1 && $no_docs >= $maxdocs); 195 192 last unless defined $gml && $gml =~ /\w/; 196 } # while(1) document level193 } # while(1) document level 197 194 198 195 return $no_docs; # no of docs processed 199 196 } 200 197 198 sub _unescape_text { 199 my ($text) = @_; 200 201 # special characters in the gml encoding 202 $text =~ s/</</g; 203 $text =~ s/>/>/g; 204 $text =~ s/"/\"/g; 205 $text =~ s/&/&/g; # this has to be last... 206 207 return $text; 208 } 201 209 202 210 1; -
trunk/gsdl/perllib/plugins/HBPlug.pm
r1020 r1244 24 24 ########################################################################### 25 25 26 # plugin which process an HTML book directory 26 # plugin which processes an HTML book directory 27 28 # This plugin is used by the Humanity Library collections and does not handle 29 # input encodings other than ascii or extended ascii 30 31 # this code is kind of ugly and could no doubt be made to run faster, by leaving 32 # it in this state I hope to encourage people to make their collections use 33 # HBSPlug instead ;-) 34 35 # Use HBSPlug if creating a new collection and marking up files like the 36 # Humanity Library collections. HBSPlug accepts all input encodings but 37 # expects the marked up files to be cleaner than those used by the 38 # Humanity Library collections 27 39 28 40 package HBPlug; 29 41 30 use plugin;31 42 use ghtml; 32 43 use BasPlug; 33 44 use util; 34 use lang;35 45 use doc; 36 use cfgread;37 46 38 47 … … 43 52 sub new { 44 53 my ($class) = @_; 45 $self = new BasPlug ();54 my $self = new BasPlug ("HBPlug", @_); 46 55 47 56 return bless $self, $class; 48 57 } 49 58 50 sub is_recursive { 51 my $self = shift (@_); 52 53 return 0; # this is not a recursive plugin 54 } 59 sub init { 60 my $self = shift (@_); 61 my ($verbosity) = @_; 62 63 $self->BasPlug::init(); 64 65 # this plugin only handles ascii encodings 66 if ($self->{'input_encoding'} !~ /^(iso_8859_1|ascii)$/) { 67 die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" . 68 $self->{'input_encoding'} . " is not an acceptable input_encoding value\n"; 69 } 70 } 71 72 # this is included only to prevent warnings being printed out 73 # from BasPlug::init. The process_exp is not used by this plugin 74 sub get_default_process_exp { 75 my $self = shift (@_); 76 77 return "This plugin does not use a process_exp\n"; 78 } 79 55 80 56 81 sub HB_read_html_file { … … 65 90 66 91 my $foundbody = 0; 67 $self->HB_gettext (\$foundbody, $text, FILE);92 $self->HB_gettext (\$foundbody, $text, "FILE"); 68 93 close FILE; 69 94 … … 72 97 $foundbody = 1; 73 98 open (FILE, $htmlfile) || return; 74 $self->HB_gettext (\$foundbody, $text, FILE);99 $self->HB_gettext (\$foundbody, $text, "FILE"); 75 100 close FILE; 76 101 } … … 159 184 } 160 185 186 # if input_encoding is ascii we can call add_utf8_metadata 187 # directly but if it's iso_8859_1 (the default) we need to call 188 # add_metadata so that the ascii2utf8 conversion is done first 189 # this should speed things up a little if processing an ascii only 190 # document with input_encoding set to ascii 191 sub HB_add_metadata { 192 my $self = shift (@_); 193 my ($doc_obj, $cursection, $field, $value) = @_; 194 195 if ($self->{'input_encoding'} eq "ascii") { 196 $doc_obj->add_utf8_metadata ($cursection, $field, $value); 197 } else { 198 $doc_obj->add_metadata ($cursection, $field, $value); 199 } 200 } 161 201 162 202 # return number of files processed, undef if can't process … … 192 232 193 233 # add metadata for top level of document 194 foreach $field (keys(%$metadata)) {234 foreach my $field (keys(%$metadata)) { 195 235 # $metadata->{$field} may be an array reference 196 236 if (ref ($metadata->{$field}) eq "ARRAY") { 197 237 map { 198 $ doc_obj->add_metadata ($cursection, $field, $_);238 $self->HB_add_metadata ($doc_obj, $cursection, $field, $_); 199 239 } @{$metadata->{$field}}; 200 240 } else { 201 $ doc_obj->add_metadata ($cursection, $field, $metadata->{$field});241 $self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field}); 202 242 } 203 243 } … … 240 280 241 281 # add the metadata to this section 242 $ doc_obj->add_metadata ($cursection, "Title", $title);282 $self->HB_add_metadata ($doc_obj, $cursection, "Title", $title); 243 283 244 284 # clean up the section html … … 251 291 252 292 # add the text for this section 253 $doc_obj->add_text ($cursection, $sectiontext); 254 293 if ($self->{'input_encoding'} eq "ascii") { 294 $doc_obj->add_utf8_text ($cursection, $sectiontext); 295 } else { 296 $doc_obj->add_text ($cursection, $sectiontext); 297 } 255 298 } else { 256 299 print STDERR "WARNING - leftover text\n" , $self->shorten($html), -
trunk/gsdl/perllib/plugins/HBSPlug.pm
r1235 r1244 32 32 # processing of html links or any other HTMLPlug type stuff is done). 33 33 34 # expects input files to have a .hb file extension 34 # expects input files to have a .hb file extension by default (this can be 35 # changed by adding a -process_exp option 35 36 36 37 # a file with the same name as the hb file but a .jpg extension is 37 # taken as the cover image 38 # taken as the cover image (jpg files are blocked by this plugin) 38 39 39 40 # HBSPlug is a simplification (and extension of) the HBPlug used … … 59 60 sub new { 60 61 my ($class) = @_; 61 my $self = new BasPlug ( @_);62 my $self = new BasPlug ("HBSPlug", @_); 62 63 63 64 return bless $self, $class; 64 65 } 65 66 66 sub is_recursive{67 sub get_default_block_exp { 67 68 my $self = shift (@_); 68 69 69 return 0; # this is not a recursive plugin 70 } 71 72 73 # return number of files processed, undef if can't process 74 # Note that $base_dir might be "" and that $file might 75 # include directories 76 sub read { 70 return q^\.jpg$^; 71 } 72 73 sub get_default_process_exp { 77 74 my $self = shift (@_); 78 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_; 75 76 return q^(?i)\.hb$^; 77 } 78 79 # do plugin specific processing of doc_obj 80 sub process { 81 my $self = shift (@_); 82 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_; 83 84 print STDERR "HBSPlug: processing $file\n" 85 if $self->{'verbosity'} > 1; 86 87 my $cursection = $doc_obj->get_top_section(); 79 88 80 89 my $filename = &util::filename_cat($base_dir, $file); … … 82 91 $absdir =~ s/[^\/\\]*$//; 83 92 84 return 0 if ($filename =~ /\.jpg$/i);85 return undef unless ($filename =~ /\.hb$/i && (-e $filename));86 87 print STDERR "HBSPlug: processing $filename\n" if $processor->{'verbosity'};88 89 # create a new document90 my $doc_obj = new doc ($file, "indexed_doc");91 my $cursection = $doc_obj->get_top_section();92 93 93 # add the cover image 94 94 my $coverimage = $filename; 95 $coverimage =~ s/\. hb/\.jpg/i;95 $coverimage =~ s/\.[^\.]*$/\.jpg/i; 96 96 $doc_obj->associate_file($coverimage, "cover.jpg", "image/jpeg"); 97 97 98 # add metadata for top level of document99 $self->extra_metadata ($doc_obj, $cursection, $metadata);100 101 # read in HTML file ($text will be in utf8)102 my $text = "";103 $self->read_file ($filename, \$text);104 105 98 my $title = ""; 106 99 107 100 # remove any leading rubbish 108 $ text=~ s/^.*?(<<TOC)/$1/ios;101 $$textref =~ s/^.*?(<<TOC)/$1/ios; 109 102 110 103 my $curtoclevel = 1; 111 104 my $firstsection = 1; 112 105 my $toccount = 0; 113 while ($ text=~ /\w/) {114 $ text=~ s/^<<TOC(\d+)>>([^\n]*)\n(.*?)(<<TOC|\Z)/$4/ios;106 while ($$textref =~ /\w/) { 107 $$textref =~ s/^<<TOC(\d+)>>([^\n]*)\n(.*?)(<<TOC|\Z)/$4/ios; 115 108 my $toclevel = $1; 116 109 my $metadata = $2; … … 166 159 $firstsection = 0; 167 160 168 $text =~ s/^\s+//s; 169 } 170 171 # add OID 172 $doc_obj->set_OID (); 173 174 # process the document 175 $processor->process($doc_obj); 176 177 return 1; # processed the file 161 $$textref =~ s/^\s+//s; 162 } 163 164 return 1; 178 165 } 179 166 -
trunk/gsdl/perllib/plugins/HTMLPlug.pm
r1243 r1244 50 50 51 51 sub print_usage { 52 print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";53 54 52 print STDERR "\n usage: plugin HTMLPlug [options]\n\n"; 55 53 print STDERR " options:\n"; … … 64 62 print STDERR " Use `H1` to get the text inside the first <H1> and </H1> tags in the text.\n"; 65 63 print STDERR " -w3mir Set if w3mir was used to generate input file structure.\n"; 66 print STDERR " w3mir \n";67 64 print STDERR " -assoc_files Perl regular expression of file extensions to associate with\n"; 68 print STDERR " html documents. Defaults to '(?i)\.(jpe?g|gif|png|css|pdf) $'\n";65 print STDERR " html documents. Defaults to '(?i)\.(jpe?g|gif|png|css|pdf)\$'\n"; 69 66 print STDERR " -rename_assoc_files Renames files associated with documents (e.g. images). Also\n"; 70 67 print STDERR " creates much shallower directory structure (useful when creating\n"; … … 85 82 q^rename_assoc_files^, \$self->{'rename_assoc_files'}, 86 83 "allow_extra_options")) { 87 84 85 print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n"; 88 86 &print_usage(); 89 87 die "\n"; … … 152 150 $$textref =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/ 153 151 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge; 152 153 # add text to document object 154 $doc_obj->add_utf8_text($cursection, "<pre>\n$$textref\n</pre>"); 155 156 return 1; 154 157 } 155 158 -
trunk/gsdl/perllib/plugins/IndexPlug.pm
r809 r1244 54 54 use plugin; 55 55 use BasPlug; 56 use lang;57 56 use doc; 58 57 use util; … … 63 62 } 64 63 64 use strict; 65 65 66 sub new { 66 67 my ($class) = @_; 67 $self = new BasPlug ();68 my $self = new BasPlug ("IndexPlug", @_); 68 69 69 70 return bless $self, $class; … … 76 77 return 1; 77 78 } 78 79 79 80 80 # return number of files processed, undef if can't process … … 104 104 # process each document 105 105 my $count = 0; 106 foreach $docfile (keys (%$list)) {106 foreach my $docfile (keys (%$list)) { 107 107 last if ($maxdocs != -1 && $count >= $maxdocs); 108 108 $metadata = {}; # at present we can do this as metadata … … 113 113 # note that $list->{$docfile} is an array reference 114 114 if ($docfile !~ /key:/i) { 115 my $i = 0; 115 116 for ($i = 0; $i < scalar (@{$list->{$docfile}}); $i ++) { 116 117 if ($list->{$docfile}->[$i] =~ /^<([^>]+)>(.+)$/) { -
trunk/gsdl/perllib/plugins/RecPlug.pm
r809 r1244 38 38 } 39 39 40 use strict; 41 40 42 sub new { 41 43 my ($class) = @_; 42 my $self = new BasPlug ( );44 my $self = new BasPlug ("RecPlug", @_); 43 45 44 46 $self->{'exclude_tail_dirs'} = []; # empty by default … … 62 64 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_; 63 65 64 foreach $etd ( @{$self->{'exclude_tail_dirs'}} )66 foreach my $etd ( @{$self->{'exclude_tail_dirs'}} ) 65 67 { 66 68 return 0 if ($file =~ m/$etd/); … … 72 74 73 75 # see if this is a directory 74 $dirname = &util::filename_cat ($base_dir, $file);76 my $dirname = &util::filename_cat ($base_dir, $file); 75 77 if (-d $dirname) { 76 78 -
trunk/gsdl/perllib/plugins/TEXTPlug.pm
r732 r1244 24 24 ########################################################################### 25 25 26 # creates simple single-level document from .txt or .text files 27 # (case-insensitive match on filenames). Adds Title metadata 28 # of first 100 characters found. 26 # creates simple single-level document. Adds Title metadata 27 # of first line of text (up to 100 characters long). 29 28 30 29 package TEXTPlug; 31 30 32 31 use BasPlug; 33 use sorttools;34 32 35 33 sub BEGIN { … … 37 35 } 38 36 37 use strict; 38 39 39 sub new { 40 40 my ($class) = @_; 41 $self = new BasPlug ();41 my $self = new BasPlug ("TEXTPlug", @_); 42 42 43 43 return bless $self, $class; 44 44 } 45 45 46 sub is_recursive{46 sub get_default_process_exp { 47 47 my $self = shift (@_); 48 48 49 return 0; # this is not a recursive plugin49 return q^(?i)\.te?xt$^; 50 50 } 51 51 52 53 # return number of files processed, undef if can't process 54 # Note that $base_dir might be "" and that $file might 55 # include directories 56 sub read { 52 # do plugin specific processing of doc_obj 53 sub process { 57 54 my $self = shift (@_); 58 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_; 59 60 my $filename = &util::filename_cat($base_dir, $file); 61 62 return undef unless ($filename =~ /\.(te?xt(\.gz)?)$/i && (-e $filename)); 63 64 my $gz = 0; 65 if (defined $2) { 66 $gz = $2; 67 $gz = 1 if ($gz =~ /\.gz/i); 68 } 69 70 print STDERR "TEXTPlug: processing $filename\n" if $processor->{'verbosity'}; 71 72 # create a new document 73 my $doc_obj = new doc ($file, "indexed_doc"); 74 75 if ($gz) { 76 open (FILE, "zcat $filename |") || die "TEXTPlug::read - zcat can't open $filename\n"; 77 } else { 78 open (FILE, $filename) || die "TEXTPlug::read - can't open $filename\n"; 79 } 55 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_; 56 57 print STDERR "TEXTPlug: processing $file\n" 58 if $self->{'verbosity'} > 1; 59 80 60 my $cursection = $doc_obj->get_top_section(); 81 82 my $text = ""; 83 my $line = ""; 84 my $foundtitle = 0; 85 # don't need to get title if it has been passed 86 # in from another plugin 87 if (defined $metadata->{'Title'}) { 88 $foundtitle = 1; 89 } 90 while (defined ($line = <FILE>)) { 91 # use first line as title (or first 100 characters if it's long) 92 if (!$foundtitle && length($line) > 5) { 93 my $title = ""; 94 if (length($line) > 100) { 95 $title = substr ($line, 0, 100); 96 } else { 97 $title = $line; 98 } 99 $doc_obj->add_metadata ($cursection, "Title", $title); 100 $foundtitle = 1; 61 62 # get title metadata 63 # (don't need to get title if it has been passed 64 # in from another plugin) 65 if (!defined $metadata->{'Title'}) { 66 my ($title) = $$textref =~ /^([^\n]*)/; 67 if (length($title) > 100) { 68 $title = substr ($title, 0, 100); 101 69 } 102 $ text .= $line;70 $doc_obj->add_utf8_metadata ($cursection, "Title", $title); 103 71 } 104 72 105 $doc_obj->add_text ($cursection, "<pre>\n$text\n</pre>"); 73 # insert preformat tags and add text to document object 74 $doc_obj->add_utf8_text($cursection, "<pre>\n$$textref\n</pre>"); 106 75 107 108 foreach $field (keys(%$metadata)) { 109 # $metadata->{$field} may be an array reference 110 if (ref ($metadata->{$field}) eq "ARRAY") { 111 map { 112 $doc_obj->add_metadata ($cursection, $field, $_); 113 } @{$metadata->{$field}}; 114 } else { 115 $doc_obj->add_metadata ($cursection, $field, $metadata->{$field}); 116 } 117 } 118 119 # add OID 120 $doc_obj->set_OID (); 121 122 # process the document 123 $processor->process($doc_obj); 124 125 return 1; # processed the file 76 return 1; 126 77 } 127 78
Note:
See TracChangeset
for help on using the changeset viewer.