Changeset 1897


Ignore:
Timestamp:
2001-02-02T10:02:41+13:00 (23 years ago)
Author:
paynter
Message:

Convert_gml_into_tokens function a little more language tolerant,
and the thesaurus appriate to the classifier's language is used
when multiple languages are available.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/classify/phind.pm

    r1890 r1897  
    3535use BasClas;
    3636use util;
    37 
     37use ghtml;
     38use unicode;
    3839
    3940sub BEGIN {
     
    432433    my $language_exp = $self->{'language_exp'};
    433434
     435    if ($language_exp =~ /en/) {
     436    return convert_gml_to_tokens_EN($text);
     437    }
     438
    434439    # FIRST, remove GML tags
    435440    $_ = $text;
     
    463468    s/LINEBREAK/\n/sgo;
    464469
    465     $text = $_;
    466    
    467    
    468     # Language-specific word-cleanup
    469 
    470     # English
    471     if ($language_exp =~ /en/) {
    472 
    473     # remove any apostrophe that indicates omitted letters
    474     $text =~ s/(\w+)\'(\w*\s)/ $1$2 /g;
    475 
    476     # remove period that appears in a person's initals
    477     $text =~ s/\s([A-Z])\./ $1 /g;
    478 
    479     # replace hyphens in hyphenated words and names with a space
    480     $text =~ s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
    481 
    482     }
     470   
     471    s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,0))/gse;
    483472
    484473
     
    491480    # Insert newline when the end of a sentence is detected
    492481    # (delimter is:  "[\.\?\!]\s")
    493     $text =~ s/\s*[\.\?\!]\s+/\n/go;
    494 
    495 
    496     # Language-specific clause clean-up
    497 
    498     # English
    499     if ($language_exp =~ /en/) {
    500 
    501     # split numbers after four digits
    502     $text =~ s/(\d\d\d\d)/$1 /g;
    503 
    504     # split words after 32 characters
    505 
    506     # squash repeated punctuation
    507     $text =~ tr/A-Za-z0-9 //cs;
    508 
    509     # normalise clause breaks (mostly punctuation symbols) to commas
    510     $text =~ s/[^A-Za-z0-9 \n]+/ , /g;
    511 
    512     # Remove repeated commas, and replace with newline
    513     $text =~ s/\s*,[, ]+/\n/g;
    514     }
    515 
     482    s/\s*[\.\?\!]\s+/\n/go;
     483
     484    # split numbers after four digits
     485    s/(\d\d\d\d)/$1 /go;
     486   
    516487    # remove extra whitespace
    517     $_ = $text;
    518488    s/ +/ /sgo;
    519489    s/^\s+//mgo;
     
    527497    return $_;
    528498}
     499
     500# A version of convert_gml_to_tokens that is fine-tuned to the English language.
     501
     502sub convert_gml_to_tokens_EN {
     503    $_ = shift @_;
     504
     505    # FIRST, remove GML tags
     506
     507    # Replace all whitespace with a simple space
     508    s/\s+/ /gs;
     509
     510    # Remove everything that is in a tag
     511    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
     512    s/\s*<br>\s*/ LINEBREAK /isg;
     513    s/<[^>]*>/ /sg;
     514
     515    # Now we have the text, but it may contain HTML
     516    # elements coded as &gt; etc.  Remove these tags.
     517    s/&lt;/</sg;
     518    s/&gt;/>/sg;
     519
     520    s/\s+/ /sg;
     521    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
     522    s/\s*<br>\s*/ LINEBREAK /isg;
     523    s/<[^>]*>/ /sg;
     524
     525    # remove &amp; and other miscellaneous markup tags
     526    s/&amp;/&/sg;
     527    s/&lt;/</sg;
     528    s/&gt;/>/sg;
     529    s/&amp;/&/sg;
     530
     531    # replace<p> and <br> placeholders with carriage returns
     532    s/PARAGRAPHBREAK/\n/sg;
     533    s/LINEBREAK/\n/sg;
     534
     535
     536    # Exceptional punctuation
     537    #
     538    # We make special cases of some punctuation
     539
     540    # remove any apostrophe that indicates omitted letters
     541    s/(\w+)\'(\w*\s)/ $1$2 /g;
     542
     543    # remove period that appears in a person's initals
     544    s/\s([A-Z])\./ $1 /g;
     545
     546    # replace hyphens in hypheanted words and names with a space
     547    s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
     548
     549    # Convert the remaining text to "clause format",
     550    # This means removing all excess punctuation and garbage text,
     551    # normalising valid punctuation to fullstops and commas,
     552    # then putting one cluse on each line.
     553
     554    # Insert newline when the end of a sentence is detected
     555    # (delimter is:  "[\.\?\!]\s")
     556    s/\s*[\.\?\!]\s+/\n/g;
     557
     558    # split numbers after four digits
     559    s/(\d\d\d\d)/$1 /g;
     560
     561    # split words after 32 characters
     562
     563    # squash repeated punctuation
     564    tr/A-Za-z0-9 //cs;
     565
     566    # save email addresses
     567    # s/\w+@\w+\.[\w\.]+/EMAIL/g;
     568
     569    # normalise clause breaks (mostly punctuation symbols) to commas
     570    s/[^A-Za-z0-9 \n]+/ , /g;
     571
     572    # Remove repeated commas, and replace with newline
     573    s/\s*,[, ]+/\n/g;
     574
     575    # remove extra whitespace
     576    s/ +/ /sg;
     577    s/^\s+//mg;
     578    s/\s*$/\n/mg;
     579
     580    # remove lines that contain one word or less
     581    s/^\w*$//mg;
     582    s/^\s*$//mg;
     583    tr/\n//s;
     584
     585    return $_;
     586
     587}
     588
    529589
    530590
     
    614674    if ($thesaurus) {
    615675
    616     # Ensure both link and term files exist
     676    # link file exists
    617677    $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
    618678    die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
    619     $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.EN");
     679
     680    # ensure term file exists in the correct language
     681    if ($language_exp =~ /^([a-z][a-z])/) {
     682        $language = $1;
     683    } else {
     684        $language = 'en';
     685    }
     686    $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
    620687    die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
     688   
    621689
    622690    # Read the thesaurus terms
Note: See TracChangeset for help on using the changeset viewer.