Changeset 1897
- Timestamp:
- 2001-02-02T10:02:41+13:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/classify/phind.pm
r1890 r1897 35 35 use BasClas; 36 36 use util; 37 37 use ghtml; 38 use unicode; 38 39 39 40 sub BEGIN { … … 432 433 my $language_exp = $self->{'language_exp'}; 433 434 435 if ($language_exp =~ /en/) { 436 return convert_gml_to_tokens_EN($text); 437 } 438 434 439 # FIRST, remove GML tags 435 440 $_ = $text; … … 463 468 s/LINEBREAK/\n/sgo; 464 469 465 $text = $_; 466 467 468 # Language-specific word-cleanup 469 470 # English 471 if ($language_exp =~ /en/) { 472 473 # remove any apostrophe that indicates omitted letters 474 $text =~ s/(\w+)\'(\w*\s)/ $1$2 /g; 475 476 # remove period that appears in a person's initals 477 $text =~ s/\s([A-Z])\./ $1 /g; 478 479 # replace hyphens in hyphenated words and names with a space 480 $text =~ s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g; 481 482 } 470 471 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,0))/gse; 483 472 484 473 … … 491 480 # Insert newline when the end of a sentence is detected 492 481 # (delimter is: "[\.\?\!]\s") 493 $text =~ s/\s*[\.\?\!]\s+/\n/go; 494 495 496 # Language-specific clause clean-up 497 498 # English 499 if ($language_exp =~ /en/) { 500 501 # split numbers after four digits 502 $text =~ s/(\d\d\d\d)/$1 /g; 503 504 # split words after 32 characters 505 506 # squash repeated punctuation 507 $text =~ tr/A-Za-z0-9 //cs; 508 509 # normalise clause breaks (mostly punctuation symbols) to commas 510 $text =~ s/[^A-Za-z0-9 \n]+/ , /g; 511 512 # Remove repeated commas, and replace with newline 513 $text =~ s/\s*,[, ]+/\n/g; 514 } 515 482 s/\s*[\.\?\!]\s+/\n/go; 483 484 # split numbers after four digits 485 s/(\d\d\d\d)/$1 /go; 486 516 487 # remove extra whitespace 517 $_ = $text;518 488 s/ +/ /sgo; 519 489 s/^\s+//mgo; … … 527 497 return $_; 528 498 } 499 500 # A version of convert_gml_to_tokens that is fine-tuned to the English language. 501 502 sub convert_gml_to_tokens_EN { 503 $_ = shift @_; 504 505 # FIRST, remove GML tags 506 507 # Replace all whitespace with a simple space 508 s/\s+/ /gs; 509 510 # Remove everything that is in a tag 511 s/\s*<p>\s*/ PARAGRAPHBREAK /isg; 512 s/\s*<br>\s*/ LINEBREAK /isg; 513 s/<[^>]*>/ /sg; 514 515 # Now we have the text, but it may contain HTML 516 # elements coded as > etc. Remove these tags. 517 s/</</sg; 518 s/>/>/sg; 519 520 s/\s+/ /sg; 521 s/\s*<p>\s*/ PARAGRAPHBREAK /isg; 522 s/\s*<br>\s*/ LINEBREAK /isg; 523 s/<[^>]*>/ /sg; 524 525 # remove & and other miscellaneous markup tags 526 s/&/&/sg; 527 s/</</sg; 528 s/>/>/sg; 529 s/&/&/sg; 530 531 # replace<p> and <br> placeholders with carriage returns 532 s/PARAGRAPHBREAK/\n/sg; 533 s/LINEBREAK/\n/sg; 534 535 536 # Exceptional punctuation 537 # 538 # We make special cases of some punctuation 539 540 # remove any apostrophe that indicates omitted letters 541 s/(\w+)\'(\w*\s)/ $1$2 /g; 542 543 # remove period that appears in a person's initals 544 s/\s([A-Z])\./ $1 /g; 545 546 # replace hyphens in hypheanted words and names with a space 547 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g; 548 549 # Convert the remaining text to "clause format", 550 # This means removing all excess punctuation and garbage text, 551 # normalising valid punctuation to fullstops and commas, 552 # then putting one cluse on each line. 553 554 # Insert newline when the end of a sentence is detected 555 # (delimter is: "[\.\?\!]\s") 556 s/\s*[\.\?\!]\s+/\n/g; 557 558 # split numbers after four digits 559 s/(\d\d\d\d)/$1 /g; 560 561 # split words after 32 characters 562 563 # squash repeated punctuation 564 tr/A-Za-z0-9 //cs; 565 566 # save email addresses 567 # s/\w+@\w+\.[\w\.]+/EMAIL/g; 568 569 # normalise clause breaks (mostly punctuation symbols) to commas 570 s/[^A-Za-z0-9 \n]+/ , /g; 571 572 # Remove repeated commas, and replace with newline 573 s/\s*,[, ]+/\n/g; 574 575 # remove extra whitespace 576 s/ +/ /sg; 577 s/^\s+//mg; 578 s/\s*$/\n/mg; 579 580 # remove lines that contain one word or less 581 s/^\w*$//mg; 582 s/^\s*$//mg; 583 tr/\n//s; 584 585 return $_; 586 587 } 588 529 589 530 590 … … 614 674 if ($thesaurus) { 615 675 616 # Ensure both link and term files exist676 # link file exists 617 677 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk"); 618 678 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links"); 619 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.EN"); 679 680 # ensure term file exists in the correct language 681 if ($language_exp =~ /^([a-z][a-z])/) { 682 $language = $1; 683 } else { 684 $language = 'en'; 685 } 686 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language"); 620 687 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms"); 688 621 689 622 690 # Read the thesaurus terms
Note:
See TracChangeset
for help on using the changeset viewer.