Changeset 1897

Show
Ignore:
Timestamp:
02.02.2001 10:02:41 (19 years ago)
Author:
paynter
Message:

Convert_gml_into_tokens function a little more language tolerant,
and the thesaurus appriate to the classifier's language is used
when multiple languages are available.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/classify/phind.pm

    r1890 r1897  
    3535use BasClas; 
    3636use util; 
    37  
     37use ghtml; 
     38use unicode; 
    3839 
    3940sub BEGIN { 
     
    432433    my $language_exp = $self->{'language_exp'}; 
    433434 
     435    if ($language_exp =~ /en/) { 
     436    return convert_gml_to_tokens_EN($text); 
     437    } 
     438 
    434439    # FIRST, remove GML tags 
    435440    $_ = $text; 
     
    463468    s/LINEBREAK/\n/sgo; 
    464469 
    465     $text = $_; 
    466      
    467      
    468     # Language-specific word-cleanup 
    469  
    470     # English 
    471     if ($language_exp =~ /en/) { 
    472  
    473     # remove any apostrophe that indicates omitted letters  
    474     $text =~ s/(\w+)\'(\w*\s)/ $1$2 /g; 
    475  
    476     # remove period that appears in a person's initals 
    477     $text =~ s/\s([A-Z])\./ $1 /g; 
    478  
    479     # replace hyphens in hyphenated words and names with a space 
    480     $text =~ s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g; 
    481  
    482     } 
     470     
     471    s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,0))/gse; 
    483472 
    484473 
     
    491480    # Insert newline when the end of a sentence is detected 
    492481    # (delimter is:  "[\.\?\!]\s") 
    493     $text =~ s/\s*[\.\?\!]\s+/\n/go;  
    494  
    495  
    496     # Language-specific clause clean-up 
    497  
    498     # English 
    499     if ($language_exp =~ /en/) { 
    500  
    501     # split numbers after four digits 
    502     $text =~ s/(\d\d\d\d)/$1 /g; 
    503  
    504     # split words after 32 characters 
    505  
    506     # squash repeated punctuation  
    507     $text =~ tr/A-Za-z0-9 //cs; 
    508  
    509     # normalise clause breaks (mostly punctuation symbols) to commas 
    510     $text =~ s/[^A-Za-z0-9 \n]+/ , /g; 
    511  
    512     # Remove repeated commas, and replace with newline 
    513     $text =~ s/\s*,[, ]+/\n/g; 
    514     } 
    515  
     482    s/\s*[\.\?\!]\s+/\n/go;  
     483 
     484    # split numbers after four digits 
     485    s/(\d\d\d\d)/$1 /go; 
     486     
    516487    # remove extra whitespace 
    517     $_ = $text; 
    518488    s/ +/ /sgo; 
    519489    s/^\s+//mgo; 
     
    527497    return $_; 
    528498} 
     499 
     500# A version of convert_gml_to_tokens that is fine-tuned to the English language. 
     501 
     502sub convert_gml_to_tokens_EN { 
     503    $_ = shift @_; 
     504 
     505    # FIRST, remove GML tags 
     506 
     507    # Replace all whitespace with a simple space 
     508    s/\s+/ /gs; 
     509 
     510    # Remove everything that is in a tag 
     511    s/\s*<p>\s*/ PARAGRAPHBREAK /isg; 
     512    s/\s*<br>\s*/ LINEBREAK /isg; 
     513    s/<[^>]*>/ /sg; 
     514 
     515    # Now we have the text, but it may contain HTML  
     516    # elements coded as &gt; etc.  Remove these tags.  
     517    s/&lt;/</sg; 
     518    s/&gt;/>/sg; 
     519 
     520    s/\s+/ /sg; 
     521    s/\s*<p>\s*/ PARAGRAPHBREAK /isg; 
     522    s/\s*<br>\s*/ LINEBREAK /isg; 
     523    s/<[^>]*>/ /sg; 
     524 
     525    # remove &amp; and other miscellaneous markup tags 
     526    s/&amp;/&/sg; 
     527    s/&lt;/</sg; 
     528    s/&gt;/>/sg; 
     529    s/&amp;/&/sg; 
     530 
     531    # replace<p> and <br> placeholders with carriage returns 
     532    s/PARAGRAPHBREAK/\n/sg; 
     533    s/LINEBREAK/\n/sg; 
     534 
     535 
     536    # Exceptional punctuation 
     537    #  
     538    # We make special cases of some punctuation 
     539 
     540    # remove any apostrophe that indicates omitted letters  
     541    s/(\w+)\'(\w*\s)/ $1$2 /g; 
     542 
     543    # remove period that appears in a person's initals 
     544    s/\s([A-Z])\./ $1 /g; 
     545 
     546    # replace hyphens in hypheanted words and names with a space 
     547    s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g; 
     548 
     549    # Convert the remaining text to "clause format", 
     550    # This means removing all excess punctuation and garbage text, 
     551    # normalising valid punctuation to fullstops and commas, 
     552    # then putting one cluse on each line. 
     553 
     554    # Insert newline when the end of a sentence is detected 
     555    # (delimter is:  "[\.\?\!]\s") 
     556    s/\s*[\.\?\!]\s+/\n/g;  
     557 
     558    # split numbers after four digits 
     559    s/(\d\d\d\d)/$1 /g; 
     560 
     561    # split words after 32 characters 
     562 
     563    # squash repeated punctuation  
     564    tr/A-Za-z0-9 //cs; 
     565 
     566    # save email addresses 
     567    # s/\w+@\w+\.[\w\.]+/EMAIL/g; 
     568 
     569    # normalise clause breaks (mostly punctuation symbols) to commas 
     570    s/[^A-Za-z0-9 \n]+/ , /g; 
     571 
     572    # Remove repeated commas, and replace with newline 
     573    s/\s*,[, ]+/\n/g; 
     574 
     575    # remove extra whitespace 
     576    s/ +/ /sg; 
     577    s/^\s+//mg; 
     578    s/\s*$/\n/mg; 
     579 
     580    # remove lines that contain one word or less 
     581    s/^\w*$//mg; 
     582    s/^\s*$//mg; 
     583    tr/\n//s; 
     584 
     585    return $_; 
     586 
     587} 
     588 
    529589 
    530590 
     
    614674    if ($thesaurus) { 
    615675 
    616     # Ensure both link and term files exist 
     676    # link file exists 
    617677    $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk"); 
    618678    die "Cannot find thesaurus link file" unless (-e "$thesaurus_links"); 
    619     $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.EN"); 
     679 
     680    # ensure term file exists in the correct language 
     681    if ($language_exp =~ /^([a-z][a-z])/) { 
     682        $language = $1; 
     683    } else { 
     684        $language = 'en'; 
     685    } 
     686    $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language"); 
    620687    die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms"); 
     688     
    621689 
    622690    # Read the thesaurus terms