Changeset 1803


Ignore:
Timestamp:
2000-12-18T14:18:19+13:00 (23 years ago)
Author:
paynter
Message:

Moved the phind classifier's data directory into the index directory. This
means we no longer overwrite existing phind classifier data during a build.
I had to tweak the classifier code to pass the location of the building
directory to each classifier as an argument.

Location:
trunk/gsdl
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/classify/phind.pm

    r1646 r1803  
    3030#
    3131# options are:
    32 #   title=Title           The title field for this classification
     32#   button=Name           The label for the classifiers button in the
     33#                         navigation bar (defaults to "Topic").
     34#   title=Title           The metadata field used to describe each document
     35#                         (defaults to "Title").
    3336#   text=fields           The text used to build the phrase hierarchy
    34 #   phindexdir=directory  Location of phind index files
     37#                         (defaults to "section:Title,section:text").
     38#   phinddir=directory  Location of phind index files
    3539#   verbosity=num         Control amount of output
    3640#   untidy=true           Do not clean up intermediate files
     
    4145# How a classifier works. 
    4246#
    43 # When a classifier is requested in the collect.cfg file, buildcol creates a
    44 # new classifier object (such as the one defined in theis file) and later
    45 # passes each document object to the classifier in turn.  Four functions are
    46 # used:
     47# For each classifier requested in the collect.cfg file, buildcol.pl creates
     48# a new classifier object (such as the one defined in theis file) and later
     49# passes each document object to the classifier in turn for classification.
     50#
     51# Four functions are used:
    4752#
    4853# 1. "new" is called before the documents are processed to set up the
     
    5156# 2. "init" is called after buildcol.pl has created the indexes etc but
    5257#    before the documents are classified in order that the classifier might
    53 #    set any varioables it requiers, etc.
     58#    set any variables it requires, etc.
    5459#
    5560# 3. "classify" is called once for each document object.  The classifier
     
    136141    exit(1);
    137142    }
    138    
    139143
    140144    # The installation appears OK - set up the classifier
    141145    my $collection = $ENV{'GSDLCOLLECTION'};
    142     my $phindexdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"phindex");
    143146    my $language = "english";
    144147
    145     my $title = "Topic";
     148    my $button = "Phrase";
     149    my $title = "Title";
    146150    my $indexes = "section:Title,section:text";
     151
     152    my $builddir = "";
     153    my $phinddir = "";
    147154
    148155    my $suffixmode = 1;
     
    154161    # parse the options
    155162    foreach $option (@options) {
     163
     164    print STDERR "option: $option\n";
    156165
    157166    if ($option =~ /^text=(.*)$/i) {
     
    159168    } elsif ($option =~ /^title=(.*)$/i) {
    160169        $title = $1;
    161     } elsif ($option =~ /^phindexdir=(.*)$/i) {
    162         $phindexdir = $1;
     170    } elsif ($option =~ /^button=(.*)$/i) {
     171        $button = $1;
     172    } elsif ($option =~ /^builddir=(.*)$/i) {
     173        $builddir = $1;
     174    } elsif ($option =~ /^phinddir=(.*)$/i) {
     175        $phinddir = $1;
    163176    } elsif ($option =~ /^suffixsize=(.*)$/i) {
    164177        $suffixsize = $1;
     
    172185    }
    173186
    174 
     187    # classifier information
    175188    $self->{'collection'} = $collection;
    176     $self->{'title'} = $title;
     189    $self->{'titlefield'} = $title;
     190    $self->{'buttonname'} = $button;
    177191    $self->{'indexes'} = $indexes;
    178192
     193    # phrase extraction options
    179194    $self->{'suffixmode'} = $suffixmode;
    180195    $self->{'suffixsize'} = $suffixsize;
    181 
    182     $self->{'verbosity'} = $verbosity;
    183     $self->{'untidy'} = $untidy;
    184196
    185197    # limit languages
     
    188200    $self->{'delimiter'} = $delimiter;
    189201
    190     # reset phindex directory
    191     if (-e "$phindexdir") {
    192     &util::rm_r("$phindexdir");
    193     }
    194     &util::mk_dir("$phindexdir");
    195     $self->{'phindexdir'} = $phindexdir;
     202    # build directory
     203    if (!$builddir) {
     204    $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
     205    }
     206    $self->{'builddir'} = $builddir;
     207
     208    # phind directory
     209    if (!$phinddir) {
     210        $phinddir = &util::filename_cat($builddir, "phind");
     211    }
     212    $self->{'phinddir'} = $phinddir;
     213
     214    # debugging levels
     215    $self->{'verbosity'} = $verbosity;
     216    $self->{'untidy'} = $untidy;
    196217
    197218    return bless $self, $class;
     
    204225    my $self = shift (@_);
    205226
     227    # ensure we have a build directory
     228    my $builddir = $self->{'builddir'};
     229    die unless (-e "$builddir");
     230
     231    # create phind directory
     232    my $phinddir = $self->{'phinddir'};
     233    if (-e "$phinddir") {
     234    &util::rm_r("$phinddir");
     235    }
     236    &util::mk_dir("$phinddir");
     237
    206238    # open filehandles for documents and text
    207     my $phindexdir = $self->{'phindexdir'};
    208 
    209     my $clausefile =  &util::filename_cat("$phindexdir", "clauses");
     239    my $clausefile =  &util::filename_cat("$phinddir", "clauses");
    210240    &util::rm($clausefile) if (-e $clausefile);
    211241    open(TEXT, ">$clausefile") || die "Cannot open $clausefile: $!";
    212242    $self->{'txthandle'} = TEXT;
    213243
    214     my $docfile = &util::filename_cat("$phindexdir", "docs.txt");
     244    my $docfile = &util::filename_cat("$phinddir", "docs.txt");
    215245    &util::rm($docfile) if (-e $docfile);
    216246    open(DOCS, ">$docfile") || die "Cannot open $docfile: $!";
     
    232262    my $top_section = $doc_obj->get_top_section();
    233263
    234     my $title = $doc_obj->get_metadata_element ($top_section, "Title");
     264    my $titlefield = $self->{'titlefield'};
     265   
     266    my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
    235267    print "process: $title\n" if ($verbosity > 2);
    236268
     
    316348#
    317349# When get_classify_info is called, the clauses and docs.txt files have
    318 # already been constructed in the phindex directory.  This function will
     350# already been constructed in the phind directory.  This function will
    319351# translate them into compressed, indexed MGPP files that can be read by
    320352# the phindcgi script.  It will also register our classifier so that it
    321 # shows up in thenavigation bar.
     353# shows up in the navigation bar.
    322354
    323355sub get_classify_info {
     
    325357
    326358    my $verbosity = $self->{'verbosity'};
    327     my $phindexdir = $self->{'phindexdir'};
     359    my $phinddir = $self->{'phinddir'};
    328360    my $language = "english";
    329361   
     
    340372    # from the clauses file
    341373    print "\nExtracting vocabulary and statistics\n" if $verbosity;
    342     &extract_vocabulary($phindexdir, $language, $verbosity);
    343 
    344     # Use the suffix program to generate the phindex/phrases file
     374    &extract_vocabulary($phinddir, $language, $verbosity);
     375
     376    # Use the suffix program to generate the phind/phrases file
    345377    print "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
    346     &execute("suffix $phindexdir $suffixsize $suffixmode", $verbosity);
    347 
    348     # Create the phrase file and put phrase numbers in phindex/phrases
     378    &execute("suffix $phinddir $suffixsize $suffixmode", $verbosity);
     379
     380    # Create the phrase file and put phrase numbers in phind/phrases
    349381    print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
    350     &renumber_phrases("$phindexdir", $verbosity);
     382    &renumber_phrases("$phinddir", $verbosity);
    351383   
    352384    # Create the mg phrase database
     
    361393
    362394    print "\nCreating phrase databases\n";
    363     my $mg_input = &util::filename_cat($phindexdir, "pdata.txt");
     395    my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
    364396    my $mg_stem = "pdata";
    365397
    366     &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
    367     &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    368     &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);
     398    &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity);
     399    &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity);
     400    &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity);
    369401
    370402    # create the mg index of words
    371403    print "\nCreating word-level search indexes\n";
    372     $mg_input = &util::filename_cat($phindexdir, "pword.txt");
     404    $mg_input = &util::filename_cat($phinddir, "pword.txt");
    373405    $mg_stem = "pword";
    374406
    375     &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
    376     &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    377     &execute("$mg_perf_hash_build -d $phindexdir -f $mg_stem", $verbosity);
    378     &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
    379     &execute("$mg_weights_build -d $phindexdir -f $mg_stem", $verbosity);
    380     &execute("$mg_invf_dict -d $phindexdir -f $mg_stem", $verbosity);
    381 
    382     &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 1", $verbosity);
    383     &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 2", $verbosity);
    384     &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 3", $verbosity);
     407    &execute("$mg_passes -d $phinddir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
     408    &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity);
     409    &execute("$mg_perf_hash_build -d $phinddir -f $mg_stem", $verbosity);
     410    &execute("$mg_passes -d $phinddir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
     411    &execute("$mg_weights_build -d $phinddir -f $mg_stem", $verbosity);
     412    &execute("$mg_invf_dict -d $phinddir -f $mg_stem", $verbosity);
     413
     414    &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 1", $verbosity);
     415    &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 2", $verbosity);
     416    &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 3", $verbosity);
    385417
    386418    # create the mg document information database
    387419    print "\nCreating document information databases\n";
    388     $mg_input = &util::filename_cat($phindexdir, "docs.txt");
     420    $mg_input = &util::filename_cat($phinddir, "docs.txt");
    389421    $mg_stem = "docs";
    390422
    391     &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
    392     &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    393     &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);
     423    &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity);
     424    &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity);
     425    &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity);
    394426
    395427
     
    397429    if (!$untidy) {
    398430    print "\nCleaning up\n" if ($verbosity > 2);
    399     &util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
    400           "$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
    401           "$phindexdir/phrases", "$phindexdir/docs.txt",
    402           "$phindexdir/pdata.txt", "$phindexdir/pword.txt");
     431    &util::rm("$phinddir/clauses", "$phinddir/clauses.numbers",
     432          "$phinddir/clauses.vocab", "$phinddir/clauses.stats",
     433          "$phinddir/phrases", "$phinddir/docs.txt",
     434          "$phinddir/pdata.txt", "$phinddir/pword.txt");
    403435    my $outfile = 1;
    404     while (-e "$phindexdir/outPhrase.$outfile") {
    405         &util::rm("$phindexdir/outPhrase.$outfile");
     436    while (-e "$phinddir/outPhrase.$outfile") {
     437        &util::rm("$phinddir/outPhrase.$outfile");
    406438        $outfile++;
    407439    }
     
    415447    my %classifyinfo = ('thistype'=>'Invisible',
    416448            'childtype'=>'Phind',
    417             'Title'=>$self->{'title'},
     449            'Title'=>$self->{'buttonname'},
    418450            'contains'=>[]);
    419451   
     
    531563
    532564sub extract_vocabulary {
    533     my ($phindex_dir, $language, $verbosity) = @_;
     565    my ($phind_dir, $language, $verbosity) = @_;
    534566
    535567    my ($w, $l, $line, $word);
     
    575607
    576608    # Read words in the text and count occurences
    577     open(TXT, "<$phindex_dir/clauses");
     609    open(TXT, "<$phind_dir/clauses");
    578610    my @words;
    579611   
     
    684716
    685717    # Outut the words
    686     print "Saving vocabulary in $phindex_dir/clauses.vocab\n" if ($verbosity > 1);
    687     open(VOC, ">$phindex_dir/clauses.vocab");
     718    print "Saving vocabulary in $phind_dir/clauses.vocab\n" if ($verbosity > 1);
     719    open(VOC, ">$phind_dir/clauses.vocab");
    688720
    689721    for (my $i = 1; $i < $nextsymbol; $i++) {
     
    697729
    698730    # Output statistics about the vocablary
    699     print "Saving statistics in $phindex_dir/clauses.stats\n" if ($verbosity > 1);
    700     &util::rm("$phindex_dir/clauses.stats") if (-e "$phindex_dir/clauses.stats");
    701     open(STAT, ">$phindex_dir/clauses.stats")
    702     || die "Cannot open $phindex_dir/clauses.stats: $!";
     731    print "Saving statistics in $phind_dir/clauses.stats\n" if ($verbosity > 1);
     732    &util::rm("$phind_dir/clauses.stats") if (-e "$phind_dir/clauses.stats");
     733    open(STAT, ">$phind_dir/clauses.stats")
     734    || die "Cannot open $phind_dir/clauses.stats: $!";
    703735
    704736    print STAT "first_delimiter $first_delimiter\n";
     
    724756
    725757    # Save text as symbol numbers
    726     print "Saving text as numbers in $phindex_dir/clauses.numbers\n" if ($verbosity > 1);
    727    
    728     open(TXT, "<$phindex_dir/clauses");
    729     open(NUM, ">$phindex_dir/clauses.numbers");
     758    print "Saving text as numbers in $phind_dir/clauses.numbers\n" if ($verbosity > 1);
     759   
     760    open(TXT, "<$phind_dir/clauses");
     761    open(NUM, ">$phind_dir/clauses.numbers");
    730762   
    731763    $phrasedelimiter = $symbol{lc($senlimit)};
     
    763795
    764796sub renumber_phrases {
    765     my ($phindex_dir, $verbosity) = @_;
     797    my ($phind_dir, $verbosity) = @_;
    766798
    767799    # Sort the phrases into order of increasing frequency
    768800    # This means the expansions will be sorted correctly later on.
    769801    print "Sorting phrases into freq order\n" if ($verbosity);
    770     system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases");
     802    system("sort -rnt ':' +2 -o $phind_dir/phrases $phind_dir/phrases");
    771803
    772804    my @symbol;
     
    774806    # Read the vocabulary
    775807    print "Reading the vocabulary\n" if ($verbosity);
    776     open(V, "<$phindex_dir/clauses.vocab")
    777     || die "Cannot open $phindex_dir/clauses.vocab: $!";
     808    open(V, "<$phind_dir/clauses.vocab")
     809    || die "Cannot open $phind_dir/clauses.vocab: $!";
    778810   
    779811    my $i = 1;
     
    799831    my ($line);
    800832   
    801     open(IN, "<$phindex_dir/phrases");
     833    open(IN, "<$phind_dir/phrases");
    802834    while(<IN>) {
    803835   
     
    818850    print "Format phrase data for MGPP\n" if ($verbosity);
    819851   
    820     open(IN, "<$phindex_dir/phrases");
    821     open(DATA, ">$phindex_dir/pdata.txt");
    822     open(IDX, ">$phindex_dir/pword.txt");
     852    open(IN, "<$phind_dir/phrases");
     853    open(DATA, ">$phind_dir/pdata.txt");
     854    open(IDX, ">$phind_dir/pword.txt");
    823855   
    824856    my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
  • trunk/gsdl/perllib/mgbuilder.pm

    r1799 r1803  
    139139   
    140140    # load all the classifiers
    141     $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $outhandle);
     141    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);
    142142
    143143    # load up any dontgdbm fields
  • trunk/gsdl/src/phind/host/phindcgi.cpp

    r1634 r1803  
    144144  strcat(basepath, "/collect/");
    145145  strcat(basepath, collection);
    146   strcat(basepath, "/phindex");
     146  strcat(basepath, "/index/phind");
    147147
    148148  // If we don't know the phrase number, look itup
     
    720720 
    721721  // mode 1 = casefolded, unstemmed search
    722   QueryNode *queryTree = ParseQuery(query, 1);
     722  QueryNode *queryTree = ParseQuery(query, 1, 1);
    723723
    724724  // cout << "-- query --" << endl;
Note: See TracChangeset for help on using the changeset viewer.