Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1890

Timestamp:

2001-01-31T17:45:38+13:00 (23 years ago)

Author:

paynter

Message:

When multiple metadata fields have multiple values, get them all.
Initial (poor) support for multiple languages (will have to replace).
Some documentation removed.

File:

: 1 edited

trunk/gsdl/perllib/classify/phind.pm (modified) (9 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/classify/phind.pm

-              r1883
+              r1890
 ###########################################################################
+# The phind classifier plugin
+#
+# options are:
+#  -button Name           The label for the classifier's button in the
+#                         navigation bar (defaults to "Phrase").
+#  -title Title           The metadata field used to describe each document
+#                         (defaults to "Title").
+#  -text fields           The text used to build the phrase hierarchy
+#                         (defaults to "section:Title,section:text").
+#  -phinddir directory    Location of phind index files
+#  -verbosity num         Control amount of output
+#  -untidy                Do not clean up intermediate files
+#  -suffixmode num        Mode of suffix program (0 = all phrases, 1 = stopword)
+#  -savephrases filename  If set, phrase information will be stored in filename
+#                         as text. (By default, it is not set.)
+#  -thesaurus name        Name of a thesaurus stored in phind format in etc dir.
+# How a classifier works.
+#
+# For each classifier requested in the collect.cfg file, buildcol.pl creates
+# a new classifier object (such as the one defined in this file) and later
+# passes each document object to the classifier in turn for classification.
+#
+# Four functions are used:
+#
+# 1. "new" is called before the documents are processed to set up the
+#    classifier.
+#
+# 2. "init" is called after buildcol.pl has created the indexes etc but
+#    before the documents are classified in order that the classifier might
+#    set any variables it requires, etc.
+#
+# 3. "classify" is called once for each document object.  The classifier
+#    "classifies" each document and updates its local data accordingly.
+#
+# 4. "get_classify_info" is called after every document has been
+#    classified.  It collates the information about the documents and
+#    stores a reference to the classifier so that Greenstone can later
+#    display it.
+# The phind classifier plugin.
+# Options are described in the print_usage function.
+# Type "classinfo.pl phind" at the command line for a summary.
 package phind;
 …
 use BasClas;
 use util;
 sub BEGIN {
     @ISA = ('BasClas');
+}
-# Define delimiter symbols - this should be abstracted out someplace
-my $colstart = "COLLECTIONSTART";
-my $colend   = "COLLECTIONEND";
-my $doclimit = "DOCUMENTLIMIT";
-my $senlimit = "SENTENCELIMIT";
-my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
 …
 "; }
+# Phrase delimiter symbols - these should be abstracted out someplace
+my $colstart = "COLLECTIONSTART";
+my $colend   = "COLLECTIONEND";
+my $doclimit = "DOCUMENTLIMIT";
+my $senlimit = "SENTENCELIMIT";
+my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
 # Create a new phind browser based on collect.cfg
 …
+}
 # Classify each document.
+#
 …
     my $indexes = $self->{'indexes'};
     my $text = "";
     my ($part, $level, $field, $section, $data);
+    my ($part, $level, $field, $section, $data, $dataref);
     foreach $part (split(/,/, $indexes)) {
 …
     # Extract a metadata field from a document
+    # (If there is more than one element of the given type, get them all.)
     elsif ($level eq "document") {
+        $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
+        $text .= convert_gml_to_tokens($data) . "\n";
+    }
+        $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
+        foreach $data ($$dataref) {
+        $text .= convert_gml_to_tokens($data) . "\n";
+        }
+    }
     # Extract metadata from every section in a document
     elsif ($level eq "section") {
 …
         $section = $doc_obj->get_top_section();
         while (defined($section)) {
+        $data .= $doc_obj->get_metadata_element($section, $field) . "\n";
+        $dataref .= $doc_obj->get_metadata($section, $field);
+        $data .= join("\n", $$dataref) . "\n";
         $section = $doc_obj->get_next_section($section);
+        }
 …
     $text =~ tr/\n//s;
     print $txthandle "$text";
+}
 …
 sub convert_gml_to_tokens {
+    $_ = shift @_;
+    my ($text) = @_;
+    my $language_exp = $self->{'language_exp'};
     # FIRST, remove GML tags
+    $_ = $text;
     # Replace all whitespace with a simple space
     s/\s+/ /gs;
+    s/\s+/ /gso;
     # Remove everything that is in a tag
     s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
     s/\s*<br>\s*/ LINEBREAK /isg;
     s/<[^>]*>/ /sg;
+    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
+    s/\s*<br>\s*/ LINEBREAK /isgo;
+    s/<[^>]*>/ /sgo;
     # Now we have the text, but it may contain HTML
     # elements coded as &gt; etc.  Remove these tags.
     s/&lt;/</sg;
     s/&gt;/>/sg;
     s/\s+/ /sg;
     s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
     s/\s*<br>\s*/ LINEBREAK /isg;
     s/<[^>]*>/ /sg;
+    s/&lt;/</sgo;
+    s/&gt;/>/sgo;
+    s/\s+/ /sgo;
+    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
+    s/\s*<br>\s*/ LINEBREAK /isgo;
+    s/<[^>]*>/ /sgo;
     # remove &amp; and other miscellaneous markup tags
     s/&amp;/&/sg;
     s/&lt;/</sg;
     s/&gt;/>/sg;
     s/&amp;/&/sg;
+    s/&amp;/&/sgo;
+    s/&lt;/</sgo;
+    s/&gt;/>/sgo;
+    s/&amp;/&/sgo;
     # replace <p> and <br> placeholders with carriage returns
+    s/PARAGRAPHBREAK/\n/sg;
+    s/LINEBREAK/\n/sg;
+    # Exceptional punctuation
+    #
+    # We make special cases of some punctuation
+    # remove any apostrophe that indicates omitted letters
+    s/(\w+)\'(\w*\s)/ $1$2 /g;
+    # remove period that appears in a person's initials
+    s/\s([A-Z])\./ $1 /g;
+    # replace hyphens in hyphenated words and names with a space
+    s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
+    s/PARAGRAPHBREAK/\n/sgo;
+    s/LINEBREAK/\n/sgo;
+    $text = $_;
+    # Language-specific word-cleanup
+    # English
+    if ($language_exp =~ /en/) {
+    # remove any apostrophe that indicates omitted letters
+    $text =~ s/(\w+)\'(\w*\s)/ $1$2 /g;
+    # remove period that appears in a person's initials
+    $text =~ s/\s([A-Z])\./ $1 /g;
+    # replace hyphens in hyphenated words and names with a space
+    $text =~ s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
+    }
     # Convert the remaining text to "clause format",
     # This means removing all excess punctuation and garbage text,
     # normalising valid punctuation to fullstops and commas,
     # then putting one cluse on each line.
+    # then putting one clause on each line.
     # Insert newline when the end of a sentence is detected
     # (delimiter is:  "[\.\?\!]\s")
+    s/\s*[\.\?\!]\s+/\n/g;
+    # split numbers after four digits
+    s/(\d\d\d\d)/$1 /g;
+    # split words after 32 characters
+    # squash repeated punctuation
+    tr/A-Za-z0-9 //cs;
+    # save email addresses
+    # s/\w+@\w+\.[\w\.]+/EMAIL/g;
+    # normalise clause breaks (mostly punctuation symbols) to commas
+    s/[^A-Za-z0-9 \n]+/ , /g;
+    # Remove repeated commas, and replace with newline
+    s/\s*,[, ]+/\n/g;
+    $text =~ s/\s*[\.\?\!]\s+/\n/go;
+    # Language-specific clause clean-up
+    # English
+    if ($language_exp =~ /en/) {
+    # split numbers after four digits
+    $text =~ s/(\d\d\d\d)/$1 /g;
+    # split words after 32 characters
+    # squash repeated punctuation
+    $text =~ tr/A-Za-z0-9 //cs;
+    # normalise clause breaks (mostly punctuation symbols) to commas
+    $text =~ s/[^A-Za-z0-9 \n]+/ , /g;
+    # Remove repeated commas, and replace with newline
+    $text =~ s/\s*,[, ]+/\n/g;
+    }
     # remove extra whitespace
+    s/ +/ /sg;
+    s/^\s+//mg;
+    s/\s*$/\n/mg;
+    $_ = $text;
+    s/ +/ /sgo;
+    s/^\s+//mgo;
+    s/\s*$/\n/mgo;
     # remove lines that contain one word or less
     s/^\w*$//mg;
     s/^\s*$//mg;
+    s/^\S*$//mgo;
+    s/^\s*$//mgo;
     tr/\n//s;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1890

Legend:

trunk/gsdl/perllib/classify/phind.pm

Download in other formats: