Changeset 1890


Ignore:
Timestamp:
2001-01-31T17:45:38+13:00 (23 years ago)
Author:
paynter
Message:

When multiple metadata fields have multiple values, get them all.
Initial (poor) support for multiple languages (will have to replace).
Some documentation removed.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/classify/phind.pm

    r1883 r1890  
    2727###########################################################################
    2828
    29 # The phind clasifier plugin
    30 #
    31 # options are:
    32 #  -button Name           The label for the classifiers button in the
    33 #                         navigation bar (defaults to "Phrase").
    34 #  -title Title           The metadata field used to describe each document
    35 #                         (defaults to "Title").
    36 #  -text fields           The text used to build the phrase hierarchy
    37 #                         (defaults to "section:Title,section:text").
    38 #  -phinddir directory    Location of phind index files
    39 #  -verbosity num         Control amount of output
    40 #  -untidy                Do not clean up intermediate files
    41 #  -suffixmode num        Mode of suffix program (0 = all phrases, 1 = stopword)
    42 #  -savephrases filename  If set, phrase infomation will be stored in filename
    43 #                         as text. (By defualt, it is not set.)
    44 #  -thesaurus name        Name of a thesaurus stred in phind format in etc dir.
    45 
    46 # How a classifier works. 
    47 #
    48 # For each classifier requested in the collect.cfg file, buildcol.pl creates
    49 # a new classifier object (such as the one defined in theis file) and later
    50 # passes each document object to the classifier in turn for classification.
    51 #
    52 # Four functions are used:
    53 #
    54 # 1. "new" is called before the documents are processed to set up the
    55 #    classifier.
    56 #
    57 # 2. "init" is called after buildcol.pl has created the indexes etc but
    58 #    before the documents are classified in order that the classifier might
    59 #    set any variables it requires, etc.
    60 #
    61 # 3. "classify" is called once for each document object.  The classifier
    62 #    "classifies" each document and updates its local data accordingly.
    63 #
    64 # 4. "get_classify_info" is called after every document has been
    65 #    classified.  It collates the information about the documents and
    66 #    stores a reference to the classifier so that Greenstone can later
    67 #    display it.
    68 
     29# The phind clasifier plugin.
     30# Options are dexcribed in the print_usage function.
     31# Type "classinfo.pl phind" at the command line for a summary.
    6932
    7033package phind;
     
    7235use BasClas;
    7336use util;
     37
    7438
    7539sub BEGIN {
    7640    @ISA = ('BasClas');
    7741}
    78 
    79 # Define delimiter symbols - this should be abstracted out someplace
    80 my $colstart = "COLLECTIONSTART";
    81 my $colend   = "COLLECTIONEND";
    82 my $doclimit = "DOCUMENTLIMIT";
    83 my $senlimit = "SENTENCELIMIT";
    84 my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
    8542
    8643
     
    12481
    12582"; }
     83
     84
     85# Phrase delimiter symbols - these should be abstracted out someplace
     86
     87my $colstart = "COLLECTIONSTART";
     88my $colend   = "COLLECTIONEND";
     89my $doclimit = "DOCUMENTLIMIT";
     90my $senlimit = "SENTENCELIMIT";
     91my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
     92
    12693
    12794# Create a new phind browser based on collect.cfg
     
    252219}
    253220
     221
    254222# Classify each document.
    255223#
     
    295263    my $indexes = $self->{'indexes'};
    296264    my $text = "";
    297     my ($part, $level, $field, $section, $data);
     265    my ($part, $level, $field, $section, $data, $dataref);
    298266
    299267    foreach $part (split(/,/, $indexes)) {
     
    316284   
    317285    # Extract a metadata field from a document
     286    # (If ther eis more than one element of the given type, get them all.)
    318287    elsif ($level eq "document") {
    319         $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
    320         $text .= convert_gml_to_tokens($data) . "\n";
    321     }
    322    
     288        $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
     289        foreach $data ($$dataref) {
     290        $text .= convert_gml_to_tokens($data) . "\n";
     291        }
     292    }
     293
    323294    # Extract metadata from every section in a document
    324295    elsif ($level eq "section") {
     
    326297        $section = $doc_obj->get_top_section();
    327298        while (defined($section)) {
    328         $data .= $doc_obj->get_metadata_element($section, $field) . "\n";
     299        $dataref .= $doc_obj->get_metadata($section, $field);
     300        $data .= join("\n", $$dataref) . "\n";
    329301        $section = $doc_obj->get_next_section($section);
    330302        }
     
    342314    $text =~ tr/\n//s;
    343315    print $txthandle "$text";
    344    
    345316}
    346 
    347317
    348318
     
    459429sub convert_gml_to_tokens {
    460430   
    461     $_ = shift @_;
     431    my ($text) = @_;
     432    my $language_exp = $self->{'language_exp'};
    462433
    463434    # FIRST, remove GML tags
     435    $_ = $text;
    464436
    465437    # Replace all whitespace with a simple space
    466     s/\s+/ /gs;
     438    s/\s+/ /gso;
    467439
    468440    # Remove everything that is in a tag
    469     s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    470     s/\s*<br>\s*/ LINEBREAK /isg;
    471     s/<[^>]*>/ /sg;
     441    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
     442    s/\s*<br>\s*/ LINEBREAK /isgo;
     443    s/<[^>]*>/ /sgo;
    472444
    473445    # Now we have the text, but it may contain HTML
    474446    # elements coded as &gt; etc.  Remove these tags.
    475     s/&lt;/</sg;
    476     s/&gt;/>/sg;
    477 
    478     s/\s+/ /sg;
    479     s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    480     s/\s*<br>\s*/ LINEBREAK /isg;
    481     s/<[^>]*>/ /sg;
     447    s/&lt;/</sgo;
     448    s/&gt;/>/sgo;
     449
     450    s/\s+/ /sgo;
     451    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
     452    s/\s*<br>\s*/ LINEBREAK /isgo;
     453    s/<[^>]*>/ /sgo;
    482454
    483455    # remove &amp; and other miscellaneous markup tags
    484     s/&amp;/&/sg;
    485     s/&lt;/</sg;
    486     s/&gt;/>/sg;
    487     s/&amp;/&/sg;
     456    s/&amp;/&/sgo;
     457    s/&lt;/</sgo;
     458    s/&gt;/>/sgo;
     459    s/&amp;/&/sgo;
    488460
    489461    # replace<p> and <br> placeholders with carriage returns
    490     s/PARAGRAPHBREAK/\n/sg;
    491     s/LINEBREAK/\n/sg;
    492 
    493 
    494     # Exceptional punctuation
    495     #
    496     # We make special cases of some punctuation
    497 
    498     # remove any apostrophe that indicates omitted letters
    499     s/(\w+)\'(\w*\s)/ $1$2 /g;
    500 
    501     # remove period that appears in a person's initals
    502     s/\s([A-Z])\./ $1 /g;
    503 
    504     # replace hyphens in hypheanted words and names with a space
    505     s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
     462    s/PARAGRAPHBREAK/\n/sgo;
     463    s/LINEBREAK/\n/sgo;
     464
     465    $text = $_;
     466   
     467   
     468    # Language-specific word-cleanup
     469
     470    # English
     471    if ($language_exp =~ /en/) {
     472
     473    # remove any apostrophe that indicates omitted letters
     474    $text =~ s/(\w+)\'(\w*\s)/ $1$2 /g;
     475
     476    # remove period that appears in a person's initals
     477    $text =~ s/\s([A-Z])\./ $1 /g;
     478
     479    # replace hyphens in hyphenated words and names with a space
     480    $text =~ s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
     481
     482    }
    506483
    507484
    508485    # Convert the remaining text to "clause format",
     486
    509487    # This means removing all excess punctuation and garbage text,
    510488    # normalising valid punctuation to fullstops and commas,
    511     # then putting one cluse on each line.
     489    # then putting one clause on each line.
    512490
    513491    # Insert newline when the end of a sentence is detected
    514492    # (delimter is:  "[\.\?\!]\s")
    515     s/\s*[\.\?\!]\s+/\n/g;
    516 
    517     # split numbers after four digits
    518     s/(\d\d\d\d)/$1 /g;
    519 
    520     # split words after 32 characters
    521 
    522     # squash repeated punctuation
    523     tr/A-Za-z0-9 //cs;
    524 
    525     # save email addresses
    526     # s/\w+@\w+\.[\w\.]+/EMAIL/g;
    527 
    528     # normalise clause breaks (mostly punctuation symbols) to commas
    529     s/[^A-Za-z0-9 \n]+/ , /g;
    530 
    531     # Remove repeated commas, and replace with newline
    532     s/\s*,[, ]+/\n/g;
     493    $text =~ s/\s*[\.\?\!]\s+/\n/go;
     494
     495
     496    # Language-specific clause clean-up
     497
     498    # English
     499    if ($language_exp =~ /en/) {
     500
     501    # split numbers after four digits
     502    $text =~ s/(\d\d\d\d)/$1 /g;
     503
     504    # split words after 32 characters
     505
     506    # squash repeated punctuation
     507    $text =~ tr/A-Za-z0-9 //cs;
     508
     509    # normalise clause breaks (mostly punctuation symbols) to commas
     510    $text =~ s/[^A-Za-z0-9 \n]+/ , /g;
     511
     512    # Remove repeated commas, and replace with newline
     513    $text =~ s/\s*,[, ]+/\n/g;
     514    }
    533515
    534516    # remove extra whitespace
    535     s/ +/ /sg;
    536     s/^\s+//mg;
    537     s/\s*$/\n/mg;
     517    $_ = $text;
     518    s/ +/ /sgo;
     519    s/^\s+//mgo;
     520    s/\s*$/\n/mgo;
    538521
    539522    # remove lines that contain one word or less
    540     s/^\w*$//mg;
    541     s/^\s*$//mg;
     523    s/^\S*$//mgo;
     524    s/^\s*$//mgo;
    542525    tr/\n//s;
    543526
Note: See TracChangeset for help on using the changeset viewer.