Changeset 2008 for trunk/gsdl


Ignore:
Timestamp:
2001-02-19T12:22:02+13:00 (23 years ago)
Author:
paynter
Message:

Marginally better support for non-English documents.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/classify/phind.pm

    r1949 r2008  
    242242    my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
    243243    my $phrlanguage = $self->{'language_exp'};
     244
     245    print STDERR "+ CLASSIFY - doclanguage: $doclanguage, phrlanguage $phrlanguage \n";
     246
    244247    return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
    245248   
     
    432435    my ($language_exp, $text) = @_;
    433436
     437    print STDERR "+ tokenising in $language_exp\n";
     438   
    434439    if ($language_exp =~ /en/) {
    435440    return &convert_gml_to_tokens_EN($text);
    436441    }
    437442
    438     # FIRST, remove GML tags
    439443    $_ = $text;
    440444
    441     # Replace all whitespace with a simple space
    442     s/\s+/ /gso;
     445    # 1. remove GML tags
    443446
    444447    # Remove everything that is in a tag
     
    449452    # Now we have the text, but it may contain HTML
    450453    # elements coded as &gt; etc.  Remove these tags.
     454    s/&amp;/&/sgo;
    451455    s/&lt;/</sgo;
    452456    s/&gt;/>/sgo;
    453 
    454     s/\s+/ /sgo;
    455457    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
    456458    s/\s*<br>\s*/ LINEBREAK /isgo;
    457459    s/<[^>]*>/ /sgo;
    458460
    459     # remove &amp; and other miscellaneous markup tags
    460     s/&amp;/&/sgo;
    461     s/&lt;/</sgo;
    462     s/&gt;/>/sgo;
    463     s/&amp;/&/sgo;
    464 
    465     # replace<p> and <br> placeholders with carriage returns
     461    # replace<p> and <br> placeholders with clause break symbol (\n)
     462    s/\s+/ /gso;
    466463    s/PARAGRAPHBREAK/\n/sgo;
    467464    s/LINEBREAK/\n/sgo;
    468465
    469466   
    470     s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,0))/gse;
    471 
    472 
    473     # Convert the remaining text to "clause format",
    474 
    475     # This means removing all excess punctuation and garbage text,
    476     # normalising valid punctuation to fullstops and commas,
    477     # then putting one clause on each line.
    478 
    479     # Insert newline when the end of a sentence is detected
     467
     468
     469    # 2. Split the remaining text into space-delimited tokens
     470
     471    # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
     472    s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
     473
     474    # Split text at word boundaries
     475    s/\b/ /go;
     476
     477    # 3. Convert the remaining text to "clause format"
     478
     479    # Insert newline if the end of a sentence is detected
    480480    # (delimiter is:  "[\.\?\!]\s")
    481     s/\s*[\.\?\!]\s+/\n/go;
    482 
    483     # split numbers after four digits
    484     s/(\d\d\d\d)/$1 /go;
    485    
    486     # remove extra whitespace
     481    # s/\s*[\.\?\!]\s+/\n/go;
     482
     483    # remove unnecessary punctuation and replace with clause break symbol (\n)
     484    s/[^\w ]/\n/go;
     485
     486    # remove extraneous whitespace
    487487    s/ +/ /sgo;
    488488    s/^\s+//mgo;
Note: See TracChangeset for help on using the changeset viewer.