Changeset 2008 for trunk/gsdl
- Timestamp: 2001-02-19T12:22:02+13:00 (23 years ago)
- Files: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/classify/phind.pm
r1949 r2008 242 242 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language"); 243 243 my $phrlanguage = $self->{'language_exp'}; 244 245 print STDERR "+ CLASSIFY - doclanguage: $doclanguage, phrlanguage $phrlanguage \n"; 246 244 247 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i)); 245 248 … … 432 435 my ($language_exp, $text) = @_; 433 436 437 print STDERR "+ tokenising in $language_exp\n"; 438 434 439 if ($language_exp =~ /en/) { 435 440 return &convert_gml_to_tokens_EN($text); 436 441 } 437 442 438 # FIRST, remove GML tags439 443 $_ = $text; 440 444 441 # Replace all whitespace with a simple space 442 s/\s+/ /gso; 445 # 1. remove GML tags 443 446 444 447 # Remove everything that is in a tag … … 449 452 # Now we have the text, but it may contain HTML 450 453 # elements coded as &gt; etc. Remove these tags. 454 s/&amp;/&/sgo; 451 455 s/&lt;/</sgo; 452 456 s/&gt;/>/sgo; 453 454 s/\s+/ /sgo;455 457 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo; 456 458 s/\s*<br>\s*/ LINEBREAK /isgo; 457 459 s/<[^>]*>/ /sgo; 458 460 459 # remove &amp; and other miscellaneous markup tags 460 s/&amp;/&/sgo; 461 s/&lt;/</sgo; 462 s/&gt;/>/sgo; 463 s/&amp;/&/sgo; 464 465 # replace<p> and <br> placeholders with carriage returns 461 # replace<p> and <br> placeholders with clause break symbol (\n) 462 s/\s+/ /gso; 466 463 s/PARAGRAPHBREAK/\n/sgo; 467 464 s/LINEBREAK/\n/sgo; 468 465 469 466 470 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,0))/gse; 471 472 473 # Convert the remaining text to "clause format", 474 475 # This means removing all excess punctuation and garbage text, 476 # normalising valid punctuation to fullstops and commas, 477 # then putting one clause on each line. 478 479 # Insert newline when the end of a sentence is detected 467 468 469 # 2. 
Split the remaining text into space-delimited tokens 470 471 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent 472 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse; 473 474 # Split text at word boundaries 475 s/\b/ /go; 476 477 # 3. Convert the remaining text to "clause format" 478 479 # Insert newline if the end of a sentence is detected 480 480 # (delimter is: "[\.\?\!]\s") 481 s/\s*[\.\?\!]\s+/\n/go;482 483 # split numbers after four digits484 s/ (\d\d\d\d)/$1/go;485 486 # remove extra whitespace481 # s/\s*[\.\?\!]\s+/\n/go; 482 483 # remove unnecessary punctuation and replace with clause break symbol (\n) 484 s/[^\w ]/\n/go; 485 486 # remove extraneous whitespace 487 487 s/ +/ /sgo; 488 488 s/^\s+//mgo;
Note: See TracChangeset for help on using the changeset viewer.