Ignore:
Timestamp:
2001-01-31T14:10:26+13:00 (23 years ago)
Author:
paynter
Message:

Supports new parameters of suffix program and new stopword file locations.
Filtering based on language now works. print_usage updated.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/classify/phind.pm

    r1871 r1883  
    3030#
    3131# options are:
    32  button=Name           The label for the classifiers button in the
     32-button Name           The label for the classifiers button in the
    3333#                         navigation bar (defaults to "Phrase").
    3434#  -title Title           The metadata field used to describe each document
     
    4040#  -untidy                Do not clean up intermediate files
    4141#  -suffixmode num        Mode of suffix program (0 = all phrases, 1 = stopword)
    42 #  -suffixsize num        Number of symbols available to suffix program
    4342#  -savephrases filename  If set, phrase information will be stored in filename
    4443#                         as text. (By default, it is not set.)
     
    9190
    9291  options:
    93    -title        Title to use on web pages
    94    -text
    95    -title
    96    -button
    97    -language
    98    -savephrases
    99    -suffixsize
    100    -suffixmode
    101    -thesaurus
    102    -untidy
    103 ";
    104 }
     92   -text Fields    The text used to build the phrase hierarchy.
     93                   (default: 'section:Title,section:text')
     94
     95   -title Title    The metadata field used to describe each document.
     96                   (default: 'Title')
     97
     98   -button Name    The label for the classifier screen and button in
     99                   navigation bar.
     100                   (default: 'Phrase')
     101
     102   -language Regex Language or languages to use when building the hierarchy.
     103                   Languages are identified by two-letter language codes
     104                   like en (English), es (Spanish), and fr (French).
     105                   Language is a regular expression, so 'en|fr' (English or
     106                   French) and '..' (match any language) are valid.
     107                   (default: 'en')
     108
     109   -savephrases File If set, the phrase information will be stored in
     110                     the given file as text. It is probably a good idea
     111                     to use an absolute path.
     112                     (default: not set)
     113
     114   -suffixmode N   The smode parameter to the phrase extraction program.  A
     115                   value of 0 means that stopwords are ignored, and of 1
     116                   means that stopwords are used.
     117                   (default: 1)
     118
     119   -thesaurus Name Name of a thesaurus stored in phind format in the
     120                   collection's etc directory.
     121                   (default: not set)
     122
     123   -untidy         Don't remove working files.
     124
     125"; }
    105126
    106127# Create a new phind browser based on collect.cfg
     
    174195             q^builddir/.*/^, \$builddir,
    175196             q^savephrases/\d/0^, \$self->{'savephrases'},
    176              q^suffixsize/\d+/100000^, \$self->{'suffixsize'},
    177197             q^suffixmode/\d/1^, \$self->{'suffixmode'},
    178198             q^thesaurus/.*/^, \$self->{'thesaurus'},
     
    250270    print "process: $title\n" if ($verbosity > 2);
    251271
    252     # only consider english-language files
     272    # Only consider the file if it is in the correct language
    253273    my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
    254274    my $phrlanguage = $self->{'language_exp'};
     
    340360    my $verbosity = $self->{'verbosity'};
    341361    my $out = $self->{'outhandle'};
    342 
    343362    my $phinddir = $self->{'phinddir'};
    344     my $language = "english";
    345    
     363
    346364    if ($verbosity) {
    347365    print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
     
    350368    # Construct phind indexes
    351369    my $suffixmode = $self->{'suffixmode'};
    352     my $suffixsize = $self->{'suffixsize'};
    353370    my ($command, $status);
    354371   
     
    360377    # Use the suffix program to generate the phind/phrases file
    361378    print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
    362     &execute("suffix $phinddir $suffixsize $suffixmode", $verbosity, $out);
     379    &execute("suffix $phinddir $suffixmode $verbosity", $verbosity, $out);
    363380
    364381    # Create the phrase file and put phrase numbers in phind/phrases
     
    552569    my $out = $self->{'outhandle'};
    553570
    554     my $language = "english"; # $self->{'language'};
    555 
    556571    my $collectiondir = $self->{'collectiondir'};
    557 
    558572    my $phinddir = $self->{'phinddir'};
     573
     574    my $language_exp = $self->{'language_exp'};
    559575
    560576    my ($w, $l, $line, $word);
     
    576592
    577593    # Read and store the stopwords
    578     my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`;
     594    my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "stopwords");
     595    my $stopword_files = ();
     596    my ($language, $language_dir, $file, $file_name);
    579597    my %stopwords;
    580     foreach $w (split(/\s+/, $words)) {
    581     $l = lc($w);
    582     $stopwords{$l} = $w;
    583     }
    584    
     598
     599    # Examine each directory in the stopword directory
     600    opendir(STOPDIR, $stopdir);
     601    foreach $language (readdir STOPDIR) {
     602
     603    # Ignore entries that do not match the classifier's language
     604    next unless ($language =~ /$language_exp/);
     605    $language_dir = &util::filename_cat($stopdir, $language);
     606    next unless (-d "$language_dir");
     607
     608    opendir(LANGDIR, $language_dir);
     609    foreach $file (readdir LANGDIR) {
     610
     611        # Ignore entries that are not stopword files
     612        next unless ($file =~ /sw$/);
     613        $file_name = &util::filename_cat($language_dir, $file);
     614        next unless (-f "$file_name");
     615
     616        # Read the stopwords
     617        open(STOPFILE, "<$file_name");
     618        while (<STOPFILE>) {
     619        s/^\s+//;
     620        s/\s.*//;
     621        $word = $_;
     622        $l = lc($word);
     623        $stopwords{$l} = $word;
     624        }
     625        close STOPFILE;
     626
     627    }
     628    }
     629
    585630    # Read thesaurus information
    586631    if ($thesaurus) {
Note: See TracChangeset for help on using the changeset viewer.