Changeset 1890
- Timestamp:
- 2001-01-31T17:45:38+13:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/classify/phind.pm
r1883 r1890 27 27 ########################################################################### 28 28 29 # The phind clasifier plugin 30 # 31 # options are: 32 # -button Name The label for the classifiers button in the 33 # navigation bar (defaults to "Phrase"). 34 # -title Title The metadata field used to describe each document 35 # (defaults to "Title"). 36 # -text fields The text used to build the phrase hierarchy 37 # (defaults to "section:Title,section:text"). 38 # -phinddir directory Location of phind index files 39 # -verbosity num Control amount of output 40 # -untidy Do not clean up intermediate files 41 # -suffixmode num Mode of suffix program (0 = all phrases, 1 = stopword) 42 # -savephrases filename If set, phrase infomation will be stored in filename 43 # as text. (By defualt, it is not set.) 44 # -thesaurus name Name of a thesaurus stred in phind format in etc dir. 45 46 # How a classifier works. 47 # 48 # For each classifier requested in the collect.cfg file, buildcol.pl creates 49 # a new classifier object (such as the one defined in theis file) and later 50 # passes each document object to the classifier in turn for classification. 51 # 52 # Four functions are used: 53 # 54 # 1. "new" is called before the documents are processed to set up the 55 # classifier. 56 # 57 # 2. "init" is called after buildcol.pl has created the indexes etc but 58 # before the documents are classified in order that the classifier might 59 # set any variables it requires, etc. 60 # 61 # 3. "classify" is called once for each document object. The classifier 62 # "classifies" each document and updates its local data accordingly. 63 # 64 # 4. "get_classify_info" is called after every document has been 65 # classified. It collates the information about the documents and 66 # stores a reference to the classifier so that Greenstone can later 67 # display it. 68 29 # The phind clasifier plugin. 30 # Options are dexcribed in the print_usage function. 31 # Type "classinfo.pl phind" at the command line for a summary. 69 32 70 33 package phind; … … 72 35 use BasClas; 73 36 use util; 37 74 38 75 39 sub BEGIN { 76 40 @ISA = ('BasClas'); 77 41 } 78 79 # Define delimiter symbols - this should be abstracted out someplace80 my $colstart = "COLLECTIONSTART";81 my $colend = "COLLECTIONEND";82 my $doclimit = "DOCUMENTLIMIT";83 my $senlimit = "SENTENCELIMIT";84 my @delimiters = ($colstart, $colend, $doclimit, $senlimit);85 42 86 43 … … 124 81 125 82 "; } 83 84 85 # Phrase delimiter symbols - these should be abstracted out someplace 86 87 my $colstart = "COLLECTIONSTART"; 88 my $colend = "COLLECTIONEND"; 89 my $doclimit = "DOCUMENTLIMIT"; 90 my $senlimit = "SENTENCELIMIT"; 91 my @delimiters = ($colstart, $colend, $doclimit, $senlimit); 92 126 93 127 94 # Create a new phind browser based on collect.cfg … … 252 219 } 253 220 221 254 222 # Classify each document. 255 223 # … … 295 263 my $indexes = $self->{'indexes'}; 296 264 my $text = ""; 297 my ($part, $level, $field, $section, $data );265 my ($part, $level, $field, $section, $data, $dataref); 298 266 299 267 foreach $part (split(/,/, $indexes)) { … … 316 284 317 285 # Extract a metadata field from a document 286 # (If ther eis more than one element of the given type, get them all.) 318 287 elsif ($level eq "document") { 319 $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field); 320 $text .= convert_gml_to_tokens($data) . "\n"; 321 } 322 288 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field); 289 foreach $data ($$dataref) { 290 $text .= convert_gml_to_tokens($data) . "\n"; 291 } 292 } 293 323 294 # Extract metadata from every section in a document 324 295 elsif ($level eq "section") { … … 326 297 $section = $doc_obj->get_top_section(); 327 298 while (defined($section)) { 328 $data .= $doc_obj->get_metadata_element($section, $field) . "\n"; 299 $dataref .= $doc_obj->get_metadata($section, $field); 300 $data .= join("\n", $$dataref) . "\n"; 329 301 $section = $doc_obj->get_next_section($section); 330 302 } … … 342 314 $text =~ tr/\n//s; 343 315 print $txthandle "$text"; 344 345 316 } 346 347 317 348 318 … … 459 429 sub convert_gml_to_tokens { 460 430 461 $_ = shift @_; 431 my ($text) = @_; 432 my $language_exp = $self->{'language_exp'}; 462 433 463 434 # FIRST, remove GML tags 435 $_ = $text; 464 436 465 437 # Replace all whitespace with a simple space 466 s/\s+/ /gs ;438 s/\s+/ /gso; 467 439 468 440 # Remove everything that is in a tag 469 s/\s*<p>\s*/ PARAGRAPHBREAK /isg ;470 s/\s*<br>\s*/ LINEBREAK /isg ;471 s/<[^>]*>/ /sg ;441 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo; 442 s/\s*<br>\s*/ LINEBREAK /isgo; 443 s/<[^>]*>/ /sgo; 472 444 473 445 # Now we have the text, but it may contain HTML 474 446 # elements coded as > etc. Remove these tags. 475 s/</</sg ;476 s/>/>/sg ;477 478 s/\s+/ /sg ;479 s/\s*<p>\s*/ PARAGRAPHBREAK /isg ;480 s/\s*<br>\s*/ LINEBREAK /isg ;481 s/<[^>]*>/ /sg ;447 s/</</sgo; 448 s/>/>/sgo; 449 450 s/\s+/ /sgo; 451 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo; 452 s/\s*<br>\s*/ LINEBREAK /isgo; 453 s/<[^>]*>/ /sgo; 482 454 483 455 # remove & and other miscellaneous markup tags 484 s/&/&/sg ;485 s/</</sg ;486 s/>/>/sg ;487 s/&/&/sg ;456 s/&/&/sgo; 457 s/</</sgo; 458 s/>/>/sgo; 459 s/&/&/sgo; 488 460 489 461 # replace<p> and <br> placeholders with carriage returns 490 s/PARAGRAPHBREAK/\n/sg; 491 s/LINEBREAK/\n/sg; 492 493 494 # Exceptional punctuation 495 # 496 # We make special cases of some punctuation 497 498 # remove any apostrophe that indicates omitted letters 499 s/(\w+)\'(\w*\s)/ $1$2 /g; 500 501 # remove period that appears in a person's initals 502 s/\s([A-Z])\./ $1 /g; 503 504 # replace hyphens in hypheanted words and names with a space 505 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g; 462 s/PARAGRAPHBREAK/\n/sgo; 463 s/LINEBREAK/\n/sgo; 464 465 $text = $_; 466 467 468 # Language-specific word-cleanup 469 470 # English 471 if ($language_exp =~ /en/) { 472 473 # remove any apostrophe that indicates omitted letters 474 $text =~ s/(\w+)\'(\w*\s)/ $1$2 /g; 475 476 # remove period that appears in a person's initals 477 $text =~ s/\s([A-Z])\./ $1 /g; 478 479 # replace hyphens in hyphenated words and names with a space 480 $text =~ s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g; 481 482 } 506 483 507 484 508 485 # Convert the remaining text to "clause format", 486 509 487 # This means removing all excess punctuation and garbage text, 510 488 # normalising valid punctuation to fullstops and commas, 511 # then putting one cl use on each line.489 # then putting one clause on each line. 512 490 513 491 # Insert newline when the end of a sentence is detected 514 492 # (delimter is: "[\.\?\!]\s") 515 s/\s*[\.\?\!]\s+/\n/g; 516 517 # split numbers after four digits 518 s/(\d\d\d\d)/$1 /g; 519 520 # split words after 32 characters 521 522 # squash repeated punctuation 523 tr/A-Za-z0-9 //cs; 524 525 # save email addresses 526 # s/\w+@\w+\.[\w\.]+/EMAIL/g; 527 528 # normalise clause breaks (mostly punctuation symbols) to commas 529 s/[^A-Za-z0-9 \n]+/ , /g; 530 531 # Remove repeated commas, and replace with newline 532 s/\s*,[, ]+/\n/g; 493 $text =~ s/\s*[\.\?\!]\s+/\n/go; 494 495 496 # Language-specific clause clean-up 497 498 # English 499 if ($language_exp =~ /en/) { 500 501 # split numbers after four digits 502 $text =~ s/(\d\d\d\d)/$1 /g; 503 504 # split words after 32 characters 505 506 # squash repeated punctuation 507 $text =~ tr/A-Za-z0-9 //cs; 508 509 # normalise clause breaks (mostly punctuation symbols) to commas 510 $text =~ s/[^A-Za-z0-9 \n]+/ , /g; 511 512 # Remove repeated commas, and replace with newline 513 $text =~ s/\s*,[, ]+/\n/g; 514 } 533 515 534 516 # remove extra whitespace 535 s/ +/ /sg; 536 s/^\s+//mg; 537 s/\s*$/\n/mg; 517 $_ = $text; 518 s/ +/ /sgo; 519 s/^\s+//mgo; 520 s/\s*$/\n/mgo; 538 521 539 522 # remove lines that contain one word or less 540 s/^\ w*$//mg;541 s/^\s*$//mg ;523 s/^\S*$//mgo; 524 s/^\s*$//mgo; 542 525 tr/\n//s; 543 526
Note:
See TracChangeset
for help on using the changeset viewer.