Changeset 1219 for trunk/gsdl/perllib/plugins/BasPlug.pm
- Timestamp: 2000-06-21T10:14:14+12:00 (24 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BasPlug.pm
r839 r1219 26 26 package BasPlug; 27 27 28 use parsargv; 29 use multiread; 30 use cnseg; 31 use strict; 32 33 sub print_usage { 34 print STDERR "\nOne of your plugins uses an incorrect general option (general options are those\n"; 35 print STDERR "available to all plugins). Check your collect.cfg configuration file.\n"; 36 37 print STDERR "\n usage: plugin plugin-name [options]\n\n"; 38 print STDERR " currently supported general options are:\n"; 39 print STDERR " -input_encoding The encoding of the source documents. Documents will be\n"; 40 print STDERR " converted from these encodings and stored internally as\n"; 41 print STDERR " utf8. The default input_encoding is Latin1. Accepted values\n"; 42 print STDERR " are:\n"; 43 print STDERR " iso_8859_1 (extended ascii)\n"; 44 print STDERR " Latin1 (the same as iso-8859-1)\n"; 45 print STDERR " ascii (7 bit ascii -- may be faster than Latin1 as no\n"; 46 print STDERR " conversion is neccessary)\n"; 47 print STDERR " gb (GB or GBK simplified Chinese)\n"; 48 print STDERR " iso_8859_6 (8 bit Arabic)\n"; 49 print STDERR " Arabic (the same as iso-8859-6)\n"; 50 print STDERR " utf8 (either utf8 or unicode -- automatically detected)\n"; 51 print STDERR " unicode (just unicode -- doesn't currently do endian\n"; 52 print STDERR " detection)\n\n"; 53 } 28 54 29 55 sub new { 30 my ($class) = @_;56 my $class = shift (@_); 31 57 32 return bless {}, $class; 58 my $self = {}; 59 my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|Arabic|utf8|unicode)\$"; 60 61 # general options available to all plugins 62 if (!parsargv::parse(\@_, "input_encoding/$encodings/Latin1", \$self->{'input_encoding'}, 63 "allow_extra_options")) { 64 &print_usage(); 65 die "\n"; 66 } 67 68 return bless $self, $class; 33 69 } 34 70 … … 61 97 } 62 98 63 sub extra_metadata 64 { 65 my ($self,$doc_obj,$cursection, $metadata) = @_; 99 # uses the multiread package to read in the entire file pointed to 100 # by filename and loads the resulting text into 
$$textref. Input text 101 # may be in any of the encodings handled by multiread, output text 102 # will be in utf8 103 sub read_file { 104 my $self = shift (@_); 105 my ($filename, $textref) = @_; 66 106 67 foreach $field (keys(%$metadata)) { 107 $$textref = ""; 108 my $encoding = ""; 109 if ($self->{'input_encoding'} =~ /^(Latin1|iso_8859_1)$/) { 110 $encoding = "latin1"; 111 } elsif ($self->{'input_encoding'} =~ /^(Arabic|iso_8859_6)$/) { 112 $encoding = "arabic"; 113 } else { 114 $encoding = $self->{'input_encoding'}; 115 } 116 117 open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n"; 118 119 if ($encoding eq "ascii") { 120 undef $/; 121 $$textref = <FILE>; 122 $/ = "\n"; 123 } else { 124 my $reader = new multiread(); 125 $reader->set_handle ('BasPlug::FILE'); 126 $reader->set_encoding ($encoding); 127 $reader->read_file ($textref); 128 129 if ($encoding eq "gb") { 130 # segment the Chinese words 131 $$textref = &cnseg::segment($$textref); 132 } 133 } 134 135 close FILE; 136 } 137 138 # add any extra metadata that's been passed around from one 139 # plugin to another. 140 # extra_metadata uses add_utf8_metadata so it expects metadata values 141 # to already be in utf8 142 sub extra_metadata { 143 my $self = shift (@_); 144 my ($doc_obj, $cursection, $metadata) = @_; 145 146 foreach my $field (keys(%$metadata)) { 68 147 # $metadata->{$field} may be an array reference 69 148 if (ref ($metadata->{$field}) eq "ARRAY") { 70 149 map { 71 $doc_obj->add_ metadata ($cursection, $field, $_);150 $doc_obj->add_utf8_metadata ($cursection, $field, $_); 72 151 } @{$metadata->{$field}}; 73 152 } else { 74 $doc_obj->add_ metadata ($cursection, $field, $metadata->{$field});153 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field}); 75 154 } 76 155 }
Note: See TracChangeset for help on using the changeset viewer.