Changeset 1808


Ignore:
Timestamp:
2000-12-19T12:03:38+13:00 (23 years ago)
Author:
paynter
Message:

Option to save the phind phrases to a text file.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/classify/phind.pm

    r1803 r1808  
    3131# options are:
    3232#   button=Name           The label for the classifier's button in the
    33 #                         navigation bar (defaults to "Topic").
     33#                         navigation bar (defaults to "Phrase").
    3434#   title=Title           The metadata field used to describe each document
    3535#                         (defaults to "Title").
    3636#   text=fields           The text used to build the phrase hierarchy
    3737#                         (defaults to "section:Title,section:text").
    38 #   phinddir=directory  Location of phind index files
     38#   phinddir=directory    Location of phind index files
    3939#   verbosity=num         Control amount of output
    4040#   untidy=true           Do not clean up intermediate files
    4141#   suffixmode=num        Mode of suffix program (0 = all phrases, 1 = stopword)
    4242#   suffixsize=num        Number of symbols available to suffix program
    43 
     43#   savephrases=filename  If set, phrase information will be stored in filename
     44#                         as text. (By default, it is not set.)
    4445
    4546# How a classifier works. 
     
    155156    my $suffixmode = 1;
    156157    my $suffixsize = 40000000;
     158    my $savephrases = "";
    157159
    158160    my $verbosity = 2;
     
    174176    } elsif ($option =~ /^phinddir=(.*)$/i) {
    175177        $phinddir = $1;
     178    } elsif ($option =~ /^savephrases=(.*)$/i) {
     179        $savephrases = $1;
    176180    } elsif ($option =~ /^suffixsize=(.*)$/i) {
    177181        $suffixsize = $1;
    178182    } elsif ($option =~ /^suffixmode=(.*)$/i) {
    179183        $suffixmode = $1;
     184    } elsif ($option =~ /^untidy/i) {
     185        $untidy = 1;
    180186    } elsif ($option =~ /^verbosity=(.*)$/i) {
    181187        $verbosity = $1;
    182     } elsif ($option =~ /^untidy/i) {
    183         $untidy = 1;
    184188    }
    185189    }
     
    194198    $self->{'suffixmode'} = $suffixmode;
    195199    $self->{'suffixsize'} = $suffixsize;
     200    $self->{'savephrases'} = $savephrases if ($savephrases);
    196201
    197202    # limit languages
     
    380385    # Create the phrase file and put phrase numbers in phind/phrases
    381386    print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
    382     &renumber_phrases("$phinddir", $verbosity);
     387    &renumber_phrases($self);
    383388   
    384389    # Create the mg phrase database
     
    427432
    428433    # Tidy up stray files
    429     if (!$untidy) {
     434    if (!$self->{'untidy'}) {
    430435    print "\nCleaning up\n" if ($verbosity > 2);
    431436    &util::rm("$phinddir/clauses", "$phinddir/clauses.numbers",
     
    795800
    796801sub renumber_phrases {
    797     my ($phind_dir, $verbosity) = @_;
     802    my $self = shift (@_);
     803
     804    my $verbosity = $self->{'verbosity'};
     805    my $phind_dir = $self->{'phinddir'};
     806
     807    my $savephrases = 0;
     808    $savephrases = $self->{'savephrases'} if (defined($self->{'savephrases'}));
     809
     810                         
    798811
    799812    # Sort the phrases into order of increasing frequency
     
    802815    system("sort -rnt ':' +2 -o $phind_dir/phrases $phind_dir/phrases");
    803816
     817    # Read the vocabulary
    804818    my @symbol;
    805 
    806     # Read the vocabulary
    807819    print "Reading the vocabulary\n" if ($verbosity);
    808820    open(V, "<$phind_dir/clauses.vocab")
     
    846858    }
    847859   
    848 
     860   
    849861    # Now we create a new phrase file using phrase numbers, not the old IDs.
    850862    print "Format phrase data for MGPP\n" if ($verbosity);
    851863   
     864    # Open the basic files
    852865    open(IN, "<$phind_dir/phrases");
    853866    open(DATA, ">$phind_dir/pdata.txt");
    854867    open(IDX, ">$phind_dir/pword.txt");
    855868   
     869    # We may want to save the phrases in a separate text file
     870    if ($savephrases) {
     871    print "Saving phrases in $savephrases\n" if ($verbosity);
     872    open(SAVE, ">$savephrases");
     873    }
     874
    856875    my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
    857876    my @fields;
     
    921940    print IDX "<Document>$word\n";
    922941
    923    
    924     }
     942    # output the phrases to a text file
     943    if ($savephrases) {
     944        print SAVE "$tf\t$countdocs\t$text\n";
     945    }
     946
     947    }
     948
     949    close SAVE if ($savephrases);
    925950}
    926951
Note: See TracChangeset for help on using the changeset viewer.