Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1897

Timestamp:

2001-02-02T10:02:41+13:00 (23 years ago)

Author:

paynter

Message:

Made the convert_gml_to_tokens function a little more language-tolerant,
and the thesaurus appropriate to the classifier's language is used
when multiple languages are available.

File:

: 1 edited

trunk/gsdl/perllib/classify/phind.pm (modified) (6 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/classify/phind.pm

-              r1890
+              r1897
 use BasClas;
 use util;
+use ghtml;
+use unicode;
 sub BEGIN {
 …
     my $language_exp = $self->{'language_exp'};
+    if ($language_exp =~ /en/) {
+    return convert_gml_to_tokens_EN($text);
+    }
     # FIRST, remove GML tags
     $_ = $text;
 …
     s/LINEBREAK/\n/sgo;
+    $text = $_;
+    # Language-specific word-cleanup
+    # English
+    if ($language_exp =~ /en/) {
+    # remove any apostrophe that indicates omitted letters
+    $text =~ s/(\w+)\'(\w*\s)/ $1$2 /g;
+    # remove period that appears in a person's initals
+    $text =~ s/\s([A-Z])\./ $1 /g;
+    # replace hyphens in hyphenated words and names with a space
+    $text =~ s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
+    }
+    s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,0))/gse;
 …
     # Insert newline when the end of a sentence is detected
     # (delimter is:  "[\.\?\!]\s")
+    $text =~ s/\s*[\.\?\!]\s+/\n/go;
+    # Language-specific clause clean-up
+    # English
+    if ($language_exp =~ /en/) {
+    # split numbers after four digits
+    $text =~ s/(\d\d\d\d)/$1 /g;
+    # split words after 32 characters
+    # squash repeated punctuation
+    $text =~ tr/A-Za-z0-9 //cs;
+    # normalise clause breaks (mostly punctuation symbols) to commas
+    $text =~ s/[^A-Za-z0-9 \n]+/ , /g;
+    # Remove repeated commas, and replace with newline
+    $text =~ s/\s*,[, ]+/\n/g;
+    }
+    s/\s*[\.\?\!]\s+/\n/go;
+    # split numbers after four digits
+    s/(\d\d\d\d)/$1 /go;
     # remove extra whitespace
-    $_ = $text;
     s/ +/ /sgo;
     s/^\s+//mgo;
 …
     return $_;
+}
+# English-tuned version of convert_gml_to_tokens: takes GML text as its only argument and returns it with one clause per line.
+sub convert_gml_to_tokens_EN {
+    $_ = shift @_;    # NOTE: plain assignment to the global $_ (no "local"), so the caller's $_ is overwritten
+    # FIRST, remove GML tags
+    # Collapse every run of whitespace (including newlines) into a single space
+    s/\s+/ /gs;
+    # Turn bare <p> and <br> tags (any case) into placeholders, then blank out every other tag
+    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
+    s/\s*<br>\s*/ LINEBREAK /isg;
+    s/<[^>]*>/ /sg;
+    # Now we have the text, but it may still contain markup that was escaped
+    # as &lt;...&gt;.  Decode those entities and strip the resulting tags the same way.
+    s/&lt;/</sg;
+    s/&gt;/>/sg;
+    s/\s+/ /sg;
+    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
+    s/\s*<br>\s*/ LINEBREAK /isg;
+    s/<[^>]*>/ /sg;
+    # decode &amp;, then &lt;/&gt; and &amp; once more, which also catches double-escaped entities (e.g. &amp;lt;)
+    s/&amp;/&/sg;
+    s/&lt;/</sg;
+    s/&gt;/>/sg;
+    s/&amp;/&/sg;
+    # replace the <p> and <br> placeholders with newlines
+    s/PARAGRAPHBREAK/\n/sg;
+    s/LINEBREAK/\n/sg;
+    # Exceptional punctuation
+    #
+    # Some punctuation is handled specially before the general clean-up below.
+    # remove an apostrophe inside or at the end of a word (e.g. don't -> dont), padding the word with spaces
+    s/(\w+)\'(\w*\s)/ $1$2 /g;
+    # remove the period after a single capital letter that follows whitespace (a person's initial)
+    s/\s([A-Z])\./ $1 /g;
+    # replace hyphens in hyphenated words and names with a space
+    s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
+    # Convert the remaining text to "clause format".
+    # This means removing all excess punctuation and garbage text,
+    # normalising valid punctuation to sentence and clause breaks,
+    # then putting one clause on each line.
+    # Insert newline when the end of a sentence is detected
+    # (delimiter is:  "[\.\?\!]\s")
+    s/\s*[\.\?\!]\s+/\n/g;
+    # insert a space after every group of four consecutive digits
+    s/(\d\d\d\d)/$1 /g;
+    # split words after 32 characters (NOTE: no statement here implements this step)
+    # squash repeated punctuation (tr with /c and /s squeezes repeated characters outside A-Za-z0-9 and space)
+    tr/A-Za-z0-9 //cs;
+    # save email addresses (disabled: the substitution below is commented out)
+    # s/\w+@\w+\.[\w\.]+/EMAIL/g;
+    # normalise clause breaks: each run of characters other than letters, digits, space and newline becomes " , "
+    s/[^A-Za-z0-9 \n]+/ , /g;
+    # Replace each comma, together with any adjacent commas and spaces, by a newline
+    s/\s*,[, ]+/\n/g;
+    # remove extra whitespace: collapse spaces, strip leading whitespace per line, end each line with a newline
+    s/ +/ /sg;
+    s/^\s+//mg;
+    s/\s*$/\n/mg;
+    # remove lines that contain one word or less, then squeeze consecutive newlines into one
+    s/^\w*$//mg;
+    s/^\s*$//mg;
+    tr/\n//s;
+    return $_;
+}
 …
     if ($thesaurus) {
     # Ensure both link and term files exist
+    # link file exists
     $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
     die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
+    $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.EN");
+    # ensure term file exists in the correct language
+    if ($language_exp =~ /^([a-z][a-z])/) {
+        $language = $1;
+    } else {
+        $language = 'en';
+    }
+    $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
     die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
     # Read the thesaurus terms

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1897

Legend:

trunk/gsdl/perllib/classify/phind.pm

Download in other formats: