Changeset 1890

Show
Ignore:
Timestamp:
31.01.2001 17:45:38 (19 years ago)
Author:
paynter
Message:

When multiple metadata fields have multiple values, get them all.
Initial (poor) support for multiple languages (will have to replace).
Some documentation removed.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/classify/phind.pm

    r1883 r1890  
    2727########################################################################### 
    2828 
    29 # The phind clasifier plugin 
    30 # 
    31 # options are: 
    32 #  -button Name           The label for the classifiers button in the 
    33 #                         navigation bar (defaults to "Phrase"). 
    34 #  -title Title           The metadata field used to describe each document 
    35 #                         (defaults to "Title"). 
    36 #  -text fields           The text used to build the phrase hierarchy 
    37 #                         (defaults to "section:Title,section:text"). 
    38 #  -phinddir directory    Location of phind index files 
    39 #  -verbosity num         Control amount of output 
    40 #  -untidy                Do not clean up intermediate files 
    41 #  -suffixmode num        Mode of suffix program (0 = all phrases, 1 = stopword) 
    42 #  -savephrases filename  If set, phrase infomation will be stored in filename 
    43 #                         as text. (By defualt, it is not set.) 
    44 #  -thesaurus name        Name of a thesaurus stred in phind format in etc dir. 
    45  
    46 # How a classifier works.   
    47 # 
    48 # For each classifier requested in the collect.cfg file, buildcol.pl creates 
    49 # a new classifier object (such as the one defined in theis file) and later 
    50 # passes each document object to the classifier in turn for classification. 
    51 # 
    52 # Four functions are used: 
    53 # 
    54 # 1. "new" is called before the documents are processed to set up the 
    55 #    classifier. 
    56 # 
    57 # 2. "init" is called after buildcol.pl has created the indexes etc but 
    58 #    before the documents are classified in order that the classifier might 
    59 #    set any variables it requires, etc. 
    60 # 
    61 # 3. "classify" is called once for each document object.  The classifier 
    62 #    "classifies" each document and updates its local data accordingly. 
    63 # 
    64 # 4. "get_classify_info" is called after every document has been 
    65 #    classified.  It collates the information about the documents and 
    66 #    stores a reference to the classifier so that Greenstone can later 
    67 #    display it. 
    68  
     29# The phind clasifier plugin.  
     30# Options are dexcribed in the print_usage function. 
     31# Type "classinfo.pl phind" at the command line for a summary. 
    6932 
    7033package phind; 
     
    7235use BasClas; 
    7336use util; 
     37 
    7438 
    7539sub BEGIN { 
    7640    @ISA = ('BasClas'); 
    7741} 
    78  
    79 # Define delimiter symbols - this should be abstracted out someplace 
    80 my $colstart = "COLLECTIONSTART"; 
    81 my $colend   = "COLLECTIONEND"; 
    82 my $doclimit = "DOCUMENTLIMIT"; 
    83 my $senlimit = "SENTENCELIMIT"; 
    84 my @delimiters = ($colstart, $colend, $doclimit, $senlimit); 
    8542 
    8643 
     
    12481 
    12582"; } 
     83 
     84 
     85# Phrase delimiter symbols - these should be abstracted out someplace 
     86 
     87my $colstart = "COLLECTIONSTART"; 
     88my $colend   = "COLLECTIONEND"; 
     89my $doclimit = "DOCUMENTLIMIT"; 
     90my $senlimit = "SENTENCELIMIT"; 
     91my @delimiters = ($colstart, $colend, $doclimit, $senlimit); 
     92 
    12693 
    12794# Create a new phind browser based on collect.cfg 
     
    252219} 
    253220 
     221 
    254222# Classify each document. 
    255223# 
     
    295263    my $indexes = $self->{'indexes'}; 
    296264    my $text = ""; 
    297     my ($part, $level, $field, $section, $data); 
     265    my ($part, $level, $field, $section, $data, $dataref); 
    298266 
    299267    foreach $part (split(/,/, $indexes)) { 
     
    316284     
    317285    # Extract a metadata field from a document 
     286    # (If ther eis more than one element of the given type, get them all.) 
    318287    elsif ($level eq "document") { 
    319         $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field); 
    320         $text .= convert_gml_to_tokens($data) . "\n"; 
    321     }  
    322      
     288        $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field); 
     289        foreach $data ($$dataref) { 
     290        $text .= convert_gml_to_tokens($data) . "\n"; 
     291        }  
     292    } 
     293 
    323294    # Extract metadata from every section in a document 
    324295    elsif ($level eq "section") { 
     
    326297        $section = $doc_obj->get_top_section(); 
    327298        while (defined($section)) { 
    328         $data .= $doc_obj->get_metadata_element($section, $field) . "\n"; 
     299        $dataref .= $doc_obj->get_metadata($section, $field); 
     300        $data .= join("\n", $$dataref) . "\n"; 
    329301        $section = $doc_obj->get_next_section($section); 
    330302        } 
     
    342314    $text =~ tr/\n//s; 
    343315    print $txthandle "$text"; 
    344      
    345316} 
    346  
    347317 
    348318 
     
    459429sub convert_gml_to_tokens { 
    460430     
    461     $_ = shift @_; 
     431    my ($text) = @_; 
     432    my $language_exp = $self->{'language_exp'}; 
    462433 
    463434    # FIRST, remove GML tags 
     435    $_ = $text; 
    464436 
    465437    # Replace all whitespace with a simple space 
    466     s/\s+/ /gs; 
     438    s/\s+/ /gso; 
    467439 
    468440    # Remove everything that is in a tag 
    469     s/\s*<p>\s*/ PARAGRAPHBREAK /isg; 
    470     s/\s*<br>\s*/ LINEBREAK /isg; 
    471     s/<[^>]*>/ /sg; 
     441    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo; 
     442    s/\s*<br>\s*/ LINEBREAK /isgo; 
     443    s/<[^>]*>/ /sgo; 
    472444 
    473445    # Now we have the text, but it may contain HTML  
    474446    # elements coded as &gt; etc.  Remove these tags.  
    475     s/&lt;/</sg; 
    476     s/&gt;/>/sg; 
    477  
    478     s/\s+/ /sg; 
    479     s/\s*<p>\s*/ PARAGRAPHBREAK /isg; 
    480     s/\s*<br>\s*/ LINEBREAK /isg; 
    481     s/<[^>]*>/ /sg; 
     447    s/&lt;/</sgo; 
     448    s/&gt;/>/sgo; 
     449 
     450    s/\s+/ /sgo; 
     451    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo; 
     452    s/\s*<br>\s*/ LINEBREAK /isgo; 
     453    s/<[^>]*>/ /sgo; 
    482454 
    483455    # remove &amp; and other miscellaneous markup tags 
    484     s/&amp;/&/sg; 
    485     s/&lt;/</sg; 
    486     s/&gt;/>/sg; 
    487     s/&amp;/&/sg; 
     456    s/&amp;/&/sgo; 
     457    s/&lt;/</sgo; 
     458    s/&gt;/>/sgo; 
     459    s/&amp;/&/sgo; 
    488460 
    489461    # replace<p> and <br> placeholders with carriage returns 
    490     s/PARAGRAPHBREAK/\n/sg; 
    491     s/LINEBREAK/\n/sg; 
    492  
    493  
    494     # Exceptional punctuation 
    495     #  
    496     # We make special cases of some punctuation 
    497  
    498     # remove any apostrophe that indicates omitted letters  
    499     s/(\w+)\'(\w*\s)/ $1$2 /g; 
    500  
    501     # remove period that appears in a person's initals 
    502     s/\s([A-Z])\./ $1 /g; 
    503  
    504     # replace hyphens in hypheanted words and names with a space 
    505     s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g; 
     462    s/PARAGRAPHBREAK/\n/sgo; 
     463    s/LINEBREAK/\n/sgo; 
     464 
     465    $text = $_; 
     466     
     467     
     468    # Language-specific word-cleanup 
     469 
     470    # English 
     471    if ($language_exp =~ /en/) { 
     472 
     473    # remove any apostrophe that indicates omitted letters  
     474    $text =~ s/(\w+)\'(\w*\s)/ $1$2 /g; 
     475 
     476    # remove period that appears in a person's initals 
     477    $text =~ s/\s([A-Z])\./ $1 /g; 
     478 
     479    # replace hyphens in hyphenated words and names with a space 
     480    $text =~ s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g; 
     481 
     482    } 
    506483 
    507484 
    508485    # Convert the remaining text to "clause format", 
     486 
    509487    # This means removing all excess punctuation and garbage text, 
    510488    # normalising valid punctuation to fullstops and commas, 
    511     # then putting one cluse on each line. 
     489    # then putting one clause on each line. 
    512490 
    513491    # Insert newline when the end of a sentence is detected 
    514492    # (delimter is:  "[\.\?\!]\s") 
    515     s/\s*[\.\?\!]\s+/\n/g;  
    516  
    517     # split numbers after four digits 
    518     s/(\d\d\d\d)/$1 /g; 
    519  
    520     # split words after 32 characters 
    521  
    522     # squash repeated punctuation  
    523     tr/A-Za-z0-9 //cs; 
    524  
    525     # save email addresses 
    526     # s/\w+@\w+\.[\w\.]+/EMAIL/g; 
    527  
    528     # normalise clause breaks (mostly punctuation symbols) to commas 
    529     s/[^A-Za-z0-9 \n]+/ , /g; 
    530  
    531     # Remove repeated commas, and replace with newline 
    532     s/\s*,[, ]+/\n/g; 
     493    $text =~ s/\s*[\.\?\!]\s+/\n/go;  
     494 
     495 
     496    # Language-specific clause clean-up 
     497 
     498    # English 
     499    if ($language_exp =~ /en/) { 
     500 
     501    # split numbers after four digits 
     502    $text =~ s/(\d\d\d\d)/$1 /g; 
     503 
     504    # split words after 32 characters 
     505 
     506    # squash repeated punctuation  
     507    $text =~ tr/A-Za-z0-9 //cs; 
     508 
     509    # normalise clause breaks (mostly punctuation symbols) to commas 
     510    $text =~ s/[^A-Za-z0-9 \n]+/ , /g; 
     511 
     512    # Remove repeated commas, and replace with newline 
     513    $text =~ s/\s*,[, ]+/\n/g; 
     514    } 
    533515 
    534516    # remove extra whitespace 
    535     s/ +/ /sg; 
    536     s/^\s+//mg; 
    537     s/\s*$/\n/mg; 
     517    $_ = $text; 
     518    s/ +/ /sgo; 
     519    s/^\s+//mgo; 
     520    s/\s*$/\n/mgo; 
    538521 
    539522    # remove lines that contain one word or less 
    540     s/^\w*$//mg; 
    541     s/^\s*$//mg; 
     523    s/^\S*$//mgo; 
     524    s/^\s*$//mgo; 
    542525    tr/\n//s; 
    543526