Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1844

Timestamp:

2001-01-19T10:35:13+13:00 (23 years ago)

Author:

sjboddie

Message:

Added an 'auto' argument to BasPlug's '-input_encoding' option ('auto' is
now the default instead of 'ascii'). When -input_encoding is 'auto', textcat
is used to work out the language and encoding of each document prior to
processing it. This allows for documents within the same collection to be
in different encodings and all be imported correctly (as long as they're
in an encoding that's supported - notable exceptions at the moment are
Big5 Chinese and any kind of Japanese).
Doing things this way means each document is read in twice at import time,
no doubt slowing things down considerably. You can therefore still set
-input_encoding explicitly if you know that all your documents are a
particular encoding.

Location:

trunk/gsdl/perllib

Files:

: 6 edited

doc.pm (modified) (1 diff)
docsave.pm (modified) (1 diff)
multiread.pm (modified) (6 diffs)
plugins/BasPlug.pm (modified) (20 diffs)
plugins/HTMLPlug.pm (modified) (5 diffs)
unicode.pm (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/doc.pm

-              r1732
+              r1844
+}
+# Stores $source_encoding as the "gsdlsourceencoding" metadata element
+# of this document's top section.
+sub set_source_encoding {
+    my ($self, $source_encoding) = @_;
+    return $self->set_metadata_element (
+        $self->get_top_section(), "gsdlsourceencoding", $source_encoding
+    );
+}
+# Looks up the "gsdlsourceencoding" metadata element on the top section
+# and returns it, i.e. the source_encoding as it was provided.
+sub get_source_encoding {
+    my ($self) = @_;
+    return $self->get_metadata_element (
+        $self->get_top_section(), "gsdlsourceencoding"
+    );
+}
 sub _escape_text {
     my ($text) = @_;

trunk/gsdl/perllib/docsave.pm

r1454	r1844
239	239	"gsdlassocfile",
240	240	"$afile:$assoc_file->[2]:$dir");
241		} else {
	241	} elsif ($self->{'verbosity'} > 2) {
242	242	print $outhandle "docsave::process couldn't copy the associated file " .
243	243	"$assoc_file->[0] to $afile\n";

trunk/gsdl/perllib/multiread.pm

-              r1838
+              r1844
 # encodings currently supported are
+#
+# utf8         - either utf8 or unicode (automatically detected)
+# unicode      - just unicode (doesn't currently do endian detection)
+# gb           - GB
+# iso_8859_1   - extended ascii (iso-8859-1)
+# iso_8859_6   - 8 bit arabic (iso-8859-6)
+# windows_1256 - Windows codepage 1256 (Arabic)
+# windows_1251 - Windows codepage 1251 (Cyrillic)
+# utf8             - either utf8 or unicode (automatically detected)
+# unicode          - just unicode (doesn't currently do endian detection)
+# gb               - GB
+# iso_8859_[1-9]   - 8 bit extended ascii encodings
+# windows_125[0-6] - Windows codepages 1250 to 1256
 package multiread;
 …
     if ($self->{'encoding'} eq "iso_8859_1") {
+    # Latin 1 extended ascii (ISO-8859-1)
+    # special case for iso_8859_1 as &ascii2utf8($char) is faster than
+    # &unicode2utf8(iso2unicode('1', $char))
     return undef if (eof ($handle));
     return &unicode::ascii2utf8 (getc ($handle));
+    }
+    if ($self->{'encoding'} eq "iso_8859_6") {
+    # 8 bit Arabic (IOS-8859-6)
+    return undef if (eof ($handle));
+    return &unicode::unicode2utf8(&unicode::arabic2unicode (getc ($handle)));
+    }
+    if ($self->{'encoding'} eq "windows_1256") {
+    # Windows 1256 (Arabic)
+    return undef if (eof ($handle));
+    return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", getc ($handle)));
+    }
+    if ($self->{'encoding'} eq "windows_1251") {
+    # Windows 1251 (Cyrillic)
+    return undef if (eof ($handle));
+    return &unicode::unicode2utf8(&unicode::windows2unicode ("1251", getc ($handle)));
+    if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) {
+    return undef if (eof ($handle));
+    return &unicode::unicode2utf8(&unicode::iso2unicode ($1, getc($handle)));
+    }
+    if ($self->{'encoding'} =~ /windows_(\d{4})$/) {
+    return undef if (eof ($handle));
+    return &unicode::unicode2utf8(&unicode::windows2unicode ($1, getc($handle)));
+    }
+    if ($self->{'encoding'} =~ /^koi8_[ru]$/) {
+    return undef if (eof ($handle));
+    return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, getc($handle)));
+    }
 …
     if ($self->{'encoding'} eq "iso_8859_1") {
+    # extended ascii (ISO-8859-1)
+    # special case for iso_8859_1 as &ascii2utf8($line) is faster than
+    # &unicode2utf8(iso2unicode('1', $line))
     my $line = "";
     if (defined ($line = <$handle>)) {
 …
+    }
+    if ($self->{'encoding'} eq "iso_8859_6") {
+    # 8 bit arabic (ISO-8859-6)
+    my $line = "";
+    if (defined ($line = <$handle>)) {
+        return &unicode::unicode2utf8(&unicode::arabic2unicode ($line));
+    }
+    return undef;
+    }
+    if ($self->{'encoding'} eq "windows_1256") {
+    # Windows 1256 (Arabic)
+    my $line = "";
+    if (defined ($line = <$handle>)) {
+        return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $line));
+    }
+    return undef;
+    }
+    if ($self->{'encoding'} eq "windows_1251") {
+    # Windows 1251 (Cyrillic)
+    my $line = "";
+    if (defined ($line = <$handle>)) {
+        return &unicode::unicode2utf8(&unicode::windows2unicode ("1251", $line));
+    if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) {
+    my $line = "";
+    if (defined ($line = <$handle>)) {
+        return &unicode::unicode2utf8(&unicode::iso2unicode ($1, $line));
+    }
+    return undef;
+    }
+    if ($self->{'encoding'} =~ /windows_(\d{4})$/) {
+    my $line = "";
+    if (defined ($line = <$handle>)) {
+        return &unicode::unicode2utf8(&unicode::windows2unicode ($1, $line));
+    }
+    return undef;
+    }
+    if ($self->{'encoding'} =~ /^koi8_[ru]$/) {
+    my $line = "";
+    if (defined ($line = <$handle>)) {
+        return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $line));
+    }
     return undef;
 …
     if ($self->{'encoding'} eq "iso_8859_1") {
+    # special case for iso_8859_1 as &ascii2utf8($text) is faster than
+    # &unicode2utf8(iso2unicode('1', $text))
     undef $/;
     my $text = <$handle>;
 …
+    }
+    if ($self->{'encoding'} eq "iso_8859_6") {
+    my $text = <$handle>;
+    undef $/;
+    $/ = "\n";
+    $$outputref .= &unicode::unicode2utf8(&unicode::arabic2unicode ($text));
+    return;
+    }
+    if ($self->{'encoding'} eq "windows_1256") {
+    undef $/;
+    my $text = <$handle>;
+    $/ = "\n";
+    $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $text));
+    return;
+    }
+    if ($self->{'encoding'} eq "windows_1251") {
+    undef $/;
+    my $text = <$handle>;
+    $/ = "\n";
+    $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ("1251", $text));
+    return;
+    }
+}
+    if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) {
+    undef $/;
+    my $text = <$handle>;
+    $/ = "\n";
+    $$outputref .= &unicode::unicode2utf8(&unicode::iso2unicode ($1, $text));
+    return;
+    }
+    if ($self->{'encoding'} =~ /windows_(\d{4})$/) {
+    undef $/;
+    my $text = <$handle>;
+    $/ = "\n";
+    $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ($1, $text));
+    return;
+    }
+    if ($self->{'encoding'} =~ /^koi8_[ru]$/) {
+    undef $/;
+    my $text = <$handle>;
+    $/ = "\n";
+    $$outputref .= &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $text));
+    return;
+    }
+}
 ;

trunk/gsdl/perllib/plugins/BasPlug.pm

-              r1838
+              r1844
 use diagnostics;
 use DateExtract;
+use iso639;
+# The encodings we accept from textcat. If textcat returns an encoding
+# that isn't a key of this hash we'll print a warning and use the
+# default encoding instead. Only the keys are significant; every value
+# is the empty string.
+%supported_encodings = map { $_ => "" } qw(
+    ascii
+    iso_8859_1 windows_1252
+    iso_8859_2 windows_1250
+    iso_8859_3
+    iso_8859_4
+    iso_8859_5 windows_1251 koi8_r koi8_u
+    iso_8859_6 windows_1256
+    iso_8859_7 windows_1253
+    iso_8859_8 windows_1255
+    iso_8859_9 windows_1254
+    gb
+);
 sub print_general_usage {
 …
     print STDERR "\n  usage: plugin $plugin_name [options]\n\n";
+    print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n";
+    print STDERR "                     converted from these encodings and stored internally as\n";
+    print STDERR "                     utf8. The default input_encoding is ascii. Accepted values\n";
+    print STDERR "                     are:\n";
+    print STDERR "                        iso_8859_1 (extended ascii)\n";
+    print STDERR "                        Latin1 (the same as iso-8859-1)\n";
+    print STDERR "                        ascii (7 bit ascii -- may be faster than Latin1 as no\n";
+    print STDERR "                               conversion is neccessary)\n";
+    print STDERR "                        gb (GB or GBK simplified Chinese)\n";
+    print STDERR "                        iso_8859_6 (8 bit Arabic)\n";
+    print STDERR "                        windows_1256 (Windows codepage 1256 (Arabic))\n";
+    print STDERR "                        Arabic (the same as windows_1256)\n";
+    print STDERR "                        utf8 (either utf8 or unicode -- automatically detected)\n";
+    print STDERR "                        unicode (just unicode -- doesn't currently do endian\n";
+    print STDERR "                                 detection)\n";
+    print STDERR "                        windows_1251 (Windows codepage 1251 (Cyrillic))\n";
     print STDERR "   -process_exp      A perl regular expression to match against filenames.\n";
     print STDERR "                     Matching filenames will be processed by this plugin.\n";
     print STDERR "                     Each plugin has its own default process_exp. e.g HTMLPlug\n";
     print STDERR "                     defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
+    print STDERR "                     .htm or .html (case-insensitive).\n";
+    print STDERR "                     .htm or .html (case-insensitive).\n\n";
     print STDERR "   -block_exp        Files matching this regular expression will be blocked from\n";
     print STDERR "                     being passed to any further plugins in the list. This has no\n";
 …
     print STDERR "                     not have a default block_exp. e.g. by default HTMLPlug blocks\n";
     print STDERR "                     any files with .gif, .jpg, .jpeg, .png, .rtf or .css\n";
+    print STDERR "                     file extensions.\n";
+    print STDERR "                     file extensions.\n\n";
+    print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n";
+    print STDERR "                     converted from these encodings and stored internally as\n";
+    print STDERR "                     utf8. The default input_encoding is 'auto'. Accepted values\n";
+    print STDERR "                     are:\n";
+    print STDERR "                       auto: Use text categorization algorithm to automatically\n";
+    print STDERR "                         identify the encoding of each source document. This\n";
+    print STDERR "                         will be slower than explicitly setting the encoding\n";
+    print STDERR "                         but will work where more than one encoding is used\n";
+    print STDERR "                         within the same collection.\n";
+    print STDERR "                       ascii: Plain 7 bit ascii. This may be a little faster than\n";
+    print STDERR "                         using iso_8859_1. Beware of using 'ascii' on a collection\n";
+    print STDERR "                         of documents that may contain characters outside of plain\n";
+    print STDERR "                         7 bit ascii though (e.g. German or French documents\n";
+    print STDERR "                         containing accents), use iso_8859_1 instead.\n";
+    print STDERR "                       utf8: either utf8 or unicode -- automatically detected\n";
+    print STDERR "                       unicode: just unicode\n";
+    print STDERR "                       iso_8859_1: Latin1 (western european languages)\n";
+    print STDERR "                       windows_1252: Windows codepage 1252 (WinLatin1)\n";
+    print STDERR "                       iso_8859_2: Latin2 (central and eastern european languages)\n";
+    print STDERR "                       windows_1250: Windows codepage 1250 (WinLatin2)\n";
+    print STDERR "                       iso_8859_3: Latin3\n";
+    print STDERR "                       iso_8859_4: Latin4\n";
+    print STDERR "                       iso_8859_5: Cyrillic\n";
+    print STDERR "                       windows_1251: Windows codepage 1251 (WinCyrillic)\n";
+    print STDERR "                       koi8_r: Cyrillic - Russian\n";
+    print STDERR "                       koi8_u: Cyrillic - Ukrainian\n";
+    print STDERR "                       iso_8859_6: Arabic\n";
+    print STDERR "                       windows_1256: Windows codepage 1256 (WinArabic)\n";
+    print STDERR "                       iso_8859_7: Greek\n";
+    print STDERR "                       windows_1253: Windows codepage 1253 (WinGreek)\n";
+    print STDERR "                       iso_8859_8: Hebrew\n";
+    print STDERR "                       windows_1255: Windows codepage 1255 (WinHebrew)\n";
+    print STDERR "                       iso_8859_9: Latin5\n";
+    print STDERR "                       windows_1254: Windows codepage 1254 (WinTurkish)\n";
+    print STDERR "                       gb: GB or GBK simplified Chinese\n\n";
+    print STDERR "   -default_encoding If -input_encoding is set to 'auto' and the text categorization\n";
+    print STDERR "                     algorithm fails to extract the encoding or extracts an encoding\n";
+    print STDERR "                     that is not supported by Greenstone, this encoding will be used\n";
+    print STDERR "                     instead. The default is iso_8859_1\n\n";
+    print STDERR "   -extract_language Identify the language of each document and set 'Language' metadata. Note\n";
+    print STDERR "                     that this will be done automatically if -input_encoding is 'auto'.\n";
+    print STDERR "   -default_language If Greenstone fails to work out what language a document is the\n";
+    print STDERR "                     'Language' metadata element will be set to this value. The default\n";
+    print STDERR "                     is 'en' (ISO 639 language symbols should be used - en = English).\n";
+    print STDERR "                     Note that if -input_encoding is not set to 'auto' and -extract_language\n";
+    print STDERR "                     is not set, all documents will have their 'Language' metadata set to\n";
+    print STDERR "                     this value.\n\n";
     print STDERR "   -extract_acronyms Extract acronyms from within text and set as metadata\n\n";
+    print STDERR "   -markup_acronyms  Added acronym metadata into document text\n\n";
+    print STDERR "   -extract_langauge Identify the language of the text and set as metadata\n\n";
+    print STDERR "   -first            Comma seperated list of first sizes to extract from the text \n";
+    print STDERR "                     into a metadata field. The fields are called 'FirstNNN'.\n";
+    print STDERR "   -markup_acronyms  Add acronym metadata into document text\n\n";
+    print STDERR "   -first            Comma seperated list of first sizes to extract from the text\n";
+    print STDERR "                     into a metadata field. The fields are called 'FirstNNN'.\n\n";
     print STDERR "   -extract_email    Extract email addresses as metadata\n\n";
     print STDERR "   -extract_date     Extract dates pertaining to the content of documents about history\n\n";
+}
 …
     my $class = shift (@_);
     my $plugin_name = shift (@_);
     my $self = {};
+    my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|windows_1256|Arabic|utf8|unicode|windows_1251)\$";
+    my $enc = "^(";
+    map {$enc .= "|$_";} keys %supported_encodings;
+    my $denc = $enc . "|utf8|unicode)\$";
+    $enc .= "|utf8|unicode|auto)\$";
     $self->{'outhandle'} = STDERR;
     my $year = (localtime)[5]+1900;
 …
     # general options available to all plugins
     if (!parsargv::parse(\@_,
-             qq^input_encoding/$encodings/ascii^, \$self->{'input_encoding'},
              q^process_exp/.*/^, \$self->{'process_exp'},
              q^block_exp/.*/^, \$self->{'block_exp'},
+             qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
+             qq^default_encoding/$denc/iso_8859_1^, \$self->{'default_encoding'},
              q^extract_acronyms^, \$self->{'extract_acronyms'},
              q^extract_email^, \$self->{'extract_email'},
              q^markup_acronyms^, \$self->{'markup_acronyms'},
              q^extract_language^, \$self->{'extract_language'},
+             q^default_language/.{2}/en^, \$self->{'default_language'},
              q^first/.*/^, \$self->{'first'},
              q^extract_date^, \$self->{'date_extract'},
              "maximum_date/\\d{4}/$year", \$self->{'max_year'},
+             qq^maximum_date/\\d{4}/$year^, \$self->{'max_year'},
              q^no_bibliography^, \$self->{'no_biblio'},
+             "maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1",
+             \$self->{'max_century'},
+             qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
              "allow_extra_options")) {
 …
     $self->{'block_exp'} = $self->get_default_block_exp ();
+    }
-    # handle input_encoding aliases
-    $self->{'input_encoding'} = "iso_8859_1" if $self->{'input_encoding'} eq "Latin1";
-    $self->{'input_encoding'} = "windows_1256" if $self->{'input_encoding'} eq "Arabic";
+}
 …
+    }
+    my $outhandle = $self->{'outhandle'};
     my $filename = &util::filename_cat($base_dir, $file);
     return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
 …
     my $plugin_name = ref ($self);
     $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
+    my ($language, $encoding);
+    if ($self->{'input_encoding'} eq "auto") {
+    # use textcat to automatically work out the input encoding and language
+    ($language, $encoding) = $self->get_language_encoding ($filename);
+    } elsif ($self->{'extract_language'}) {
+    # use textcat to get language metadata
+    ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
+    $encoding = $self->{'input_encoding'};
+    if ($extracted_encoding != $encoding && $self->{'verbosity'}) {
+        print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
+        print $outhandle "appears to be encoded as $extracted_encoding.";
+    }
+    } else {
+    $language = $self->{'default_language'};
+    $encoding = $self->{'input_encoding'};
+    }
     # create a new document
     my $doc_obj = new doc ($filename, "indexed_doc");
+    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
+    $doc_obj->set_source_encoding ($encoding);
     # read in file ($text will be in utf8)
     my $text = "";
+    $self->read_file ($filename, \$text);
+    if ($text !~ /\w/) {
+    my $outhandle = $self->{'outhandle'};
+    $self->read_file ($filename, $encoding, \$text);
+    if (!length ($text)) {
     print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
     return 0;
 …
 sub read_file {
     my $self = shift (@_);
     my ($filename, $textref) = @_;
+    my ($filename, $encoding, $textref) = @_;
     if (!-r $filename)
+    {
+    print STDERR "Read permission denied for $filename\n";
+    my $outhandle = $self->{'outhandle'};
+    print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
     return;
+    }
 …
     open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
     if ($self->{'input_encoding'} eq "ascii") {
+    if ($encoding eq "ascii") {
     undef $/;
     $$textref = <FILE>;
 …
     my $reader = new multiread();
     $reader->set_handle ('BasPlug::FILE');
     $reader->set_encoding ($self->{'input_encoding'});
+    $reader->set_encoding ($encoding);
     $reader->read_file ($textref);
     if ($self->{'input_encoding'} eq "gb") {
+    if ($encoding eq "gb") {
         # segment the Chinese words
         $$textref = &cnseg::segment($$textref);
 …
     close FILE;
+}
+# Uses textcat to work out the encoding and language of the text in
+# $filename. All html tags are removed before processing.
+#
+# Returns a two element list: (language, encoding).
+#  - If textcat does not come back with exactly one candidate the
+#    defaults are returned: (default_language, input_encoding) when
+#    -input_encoding is not 'auto', otherwise
+#    (default_language, default_encoding).
+#  - A language name with no entry in %iso639::toiso639 falls back to
+#    default_language (with a warning) instead of aborting the import.
+#  - An encoding that is not a key of %supported_encodings falls back
+#    to default_encoding (with a warning).
+#
+# Dies if $filename cannot be opened for reading.
+sub get_language_encoding {
+    my $self = shift (@_);
+    my ($filename) = @_;
+    my $outhandle = $self->{'outhandle'};
+
+    # read in file. $/ is localised so that the caller's record
+    # separator is restored when we leave the block, and the three
+    # argument form of open keeps characters in $filename from being
+    # interpreted as an open mode
+    my $text;
+    {
+        local $/ = undef;
+        open (my $fh, "<", $filename)
+            || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
+        $text = <$fh>;
+        close $fh;
+    }
+    $text = "" unless defined $text;
+
+    # remove all HTML tags
+    $text =~ s/<[^>]*>//sg;
+
+    # get the language/encoding
+    my @results = textcat::classify($text);
+
+    # anything other than a single answer is treated as "don't know"
+    if (scalar @results != 1) {
+        if ($self->{'input_encoding'} ne 'auto') {
+            if ($self->{'extract_language'} && $self->{'verbosity'}) {
+                print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
+                print $outhandle "defaulting to $self->{'default_language'}\n";
+            }
+            return ($self->{'default_language'}, $self->{'input_encoding'});
+        } else {
+            if ($self->{'verbosity'}) {
+                print $outhandle "BasPlug: WARNING: language/encoding could not be extracted from $filename - ";
+                print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n";
+            }
+            return ($self->{'default_language'}, $self->{'default_encoding'});
+        }
+    }
+
+    # format language/encoding: the result is split at the first '-'
+    # into a language name and an optional encoding name
+    my ($language_name, $encoding) = $results[0] =~ /^([^-]*)(?:-(.*))?$/;
+    $language_name = "" unless defined $language_name;
+    my $language = $iso639::toiso639{lc($language_name)};
+    if (!defined $language) {
+        # a language we have no ISO 639 symbol for shouldn't bring the
+        # whole import down - treat it like any other failed extraction
+        if ($self->{'verbosity'}) {
+            print $outhandle "BasPlug: WARNING: $filename appears to be in an unknown language ($language_name) - ";
+            print $outhandle "defaulting to $self->{'default_language'}\n";
+        }
+        $language = $self->{'default_language'};
+    }
+
+    if (!defined $encoding) {
+        # if textcat returned no encoding info it is assumed to be iso_8859_1
+        $encoding = "iso_8859_1";
+    } else {
+        # convert to the format we expect
+        $encoding =~ s/windows/windows_/;
+        $encoding =~ s/iso8859/iso_8859/;
+        $encoding =~ s/^gb.*$/gb/;
+    }
+
+    if (!exists $supported_encodings{$encoding}) {
+        if ($self->{'verbosity'}) {
+            print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
+            print $outhandle "using $self->{'default_encoding'}\n";
+        }
+        $encoding = $self->{'default_encoding'};
+    }
+
+    return ($language, $encoding);
+}
 …
     print $outhandle " extracting email addresses ...\n"
     if ($self->{'verbosity'} >= 2);
+    if ($self->{'verbosity'} > 2);
     my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|[a-z][a-z]))/g);
 …
         $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
         print $outhandle "  extracting $address\n"
         if ($self->{'verbosity'} >= 3);
+        if ($self->{'verbosity'} > 3);
+    }
+    }
     print $outhandle " done extracting email addresses.\n"
     if ($self->{'verbosity'} >= 2);
+    if ($self->{'verbosity'} > 2);
+}
 …
+}
-# Identify the language of a section and add it to the metadata
-sub extract_language {
-    my $self = shift (@_);
-    my ($textref, $doc_obj, $thissection) = @_;
-    # remove all HTML tags
-    my $text = $$textref;
-    $text =~ s/<P[^>]*>/\n/sgi;
-    $text =~ s/<H[^>]*>/\n/sgi;
-    $text =~ s/<[^>]*>//sgi;
-    $text =~ tr/\n/\n/s;
-    # get the language
-    my @results = textcat::classify($text);
-    @results = ("unknown") if ($#results > 2);
-    # create language string and remove encoding information
-    my $language = join(" or ", @results);
-    $language =~ s/\-\w+//g;
-    $doc_obj->add_utf8_metadata($thissection, "Language",  $language);
-    # print "Language: ", time, "-> $language\n";
+}
 # extract acronyms from a section in a document. progress is
 # reported to outhandle based on the verbosity. both the Acronym
 …
     print $outhandle " extracting acronyms ...\n"
     if ($self->{'verbosity'} >= 2);
+    if ($self->{'verbosity'} > 2);
     my $acro_array =  &acronym::acronyms($textref);
 …
         $doc_obj->add_utf8_metadata($thissection, "Acronym",  $acro->to_string());
         print $outhandle "  adding ". $acro->to_string() . "\n"
         if ($self->{'verbosity'} >= 3);
+        if ($self->{'verbosity'} > 3);
+    }
+    }
     print $outhandle " done extracting acronyms. \n"
     if ($self->{'verbosity'} >= 2);
+    if ($self->{'verbosity'} > 2);
+}
 …
     print $outhandle " marking up acronyms ...\n"
     if ($self->{'verbosity'} >= 2);
+    if ($self->{'verbosity'} > 2);
     #self is passed in to check for verbosity ...
 …
     print $outhandle " done marking up acronyms. \n"
     if ($self->{'verbosity'} >= 2);
+    if ($self->{'verbosity'} > 2);
     return $text;
 …
 ;

trunk/gsdl/perllib/plugins/HTMLPlug.pm

-              r1699
+              r1844
             $value =~ s/\"$//;
             $value =~ s/\s+/ /gs;
-            print "adding Creator of $value\n";
             $doc_obj->add_utf8_metadata($section, "Creator", $value);
             print $outhandle " extracted Creator metadata \"$value\"\n"
                 if ($self->{'verbosity'} >= 2);
+                if ($self->{'verbosity'} > 2);
             next;
+            }
 …
             $doc_obj->add_utf8_metadata($section, $field, $value);
             print $outhandle " extracted \"$field\" metadata \"$value\"\n"
             if ($self->{'verbosity'} >= 2);
+            if ($self->{'verbosity'} > 2);
             next;
+        }
 …
             $doc_obj->add_utf8_metadata ($section, $field, $title);
             print $outhandle " extracted \"$field\" metadata \"$title\"\n"
                 if ($self->{'verbosity'} >= 2);
+                if ($self->{'verbosity'} > 2);
             next;
+            }
 …
         $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
         print $outhandle " extracted \"$field\" metadata \"$tmptext\"\n"
         if ($self->{'verbosity'} >= 2);
+        if ($self->{'verbosity'} > 2);
         next;
+    }
 …
             $doc_obj->add_utf8_metadata ($section, $tag, $word);
             print $outhandle " extracted \"$tag\" metadata \"$word\"\n"
                 if ($self->{'verbosity'} >= 2);
+                if ($self->{'verbosity'} > 2);
+            }
+        }

trunk/gsdl/perllib/unicode.pm

-              r1227
+              r1844
+}
+# arabic2unicode converts an 8 bit Arabic string (ISO-8859-6) into a
+# reference to an array of unicode values, one per input character.
+# Characters below 0x80 are passed through unchanged; everything else
+# is shifted up by a fixed offset (1567-191).
+sub arabic2unicode {
+    my ($in) = @_;
+    my @codes = map {
+        my $code = ord ($_);
+        ($code >= 0x80) ? $code + (1567-191) : $code;
+    } split (//, $in);
+    return \@codes;
+}
+# windows2unicode takes a windows encoding (e.g. Windows 1256 (Arabic))
+# windows2unicode takes a windows encoded string (e.g. Windows 1256 (Arabic))
 # and returns a unicode array. These encodings are similar to but not
 # identical to the corresponding ISO-8859 encodings.
+#
+# $encoding should be the code page name (e.g. '1252')
+#
 # The map files for these encodings should be in unicode/MAPPINGS/WINDOWS
 …
     my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
                       "WINDOWS", "$encoding.TXT");
+    return $out unless &loadmapping ($encoding, $mapfile);
+    my $i = 0;
+    my $len = length($in);
+    while ($i < $len) {
+    my $c = ord(substr ($in, $i, 1));
+    $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
+    push (@$out, $c);
+    $i++;
+    }
+    return $out;
+}
+# iso2unicode converts an iso-8859 encoded string (e.g. iso-8859-6
+# (Arabic)) into a reference to an array of unicode values. It works
+# much like windows2unicode() except that only characters >= 0xA0 are
+# looked up in the mapping (all characters below that are the same for
+# every iso-8859 character set and are therefore already the same as
+# unicode).
+#
+# While this function will work for iso-8859-1 (latin 1), it'll be much
+# faster to use ascii2unicode() or ascii2utf8() for that encoding.
+#
+# $encoding should be 1,2,3...,9 depending on which breed of iso-8859
+# the encoding is.
+#
+# The map files for these encodings should be in unicode/MAPPINGS/ISO_8859.
+# If the mapping can't be loaded an empty array is returned.
+sub iso2unicode {
+    my ($encoding, $in) = @_;
+    my $out = [];
+
+    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
+                      "ISO_8859", "$encoding.TXT");
+    &loadmapping ($encoding, $mapfile) or return $out;
+
+    foreach my $char (split (//, $in)) {
+        my $c = ord ($char);
+        if ($c >= 0xA0) {
+            $c = $translations{"$encoding-unicode"}->{$c};
+        }
+        push (@$out, $c);
+    }
+    return $out;
+}
+# cyrillic2unicode is basically identical to windows2unicode, the only
+# difference being that the map files live in unicode/MAPPINGS/CYRILLIC
+#
+# values for $encoding may be 'koi8_r' or 'koi8_u'
+sub cyrillic2unicode {
+    my ($encoding, $in) = @_;
+    my $out = [];
+    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
+                      "CYRILLIC", "$encoding.txt");
     return $out unless &loadmapping ($encoding, $mapfile);

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: