Changeset 1604 for trunk/gsdl/src/phind


Ignore:
Timestamp:
2000-10-17T12:35:59+13:00 (24 years ago)
Author:
paynter
Message:

Numerous improvements for use with the new phindcgi script. The main ones
are that three MGPP databases are now created (document data, phrase data,
and word search) and that the data extracted from each document is set
explicitly in the collection configuration file (usually it will be
something like document:text or section:Title).

Location:
trunk/gsdl/src/phind/generate
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/phind/generate/phindgen.pl

    r1591 r1604  
    7979   
    8080    my ($verbosity, $archivedir, $phindexdir,
    81     $phind, $language, $maxdocs, $untidy,
     81    $phindcfg, $language, $maxdocs, $untidy,
    8282    $collection, $configfilename, $collectcfg);
    8383   
     
    110110    }
    111111    if (defined $collectcfg->{'phind'}) {
    112         $phind = $collectcfg->{'phind'};
     112        $phindcfg = $collectcfg->{'phind'};
    113113    }
    114114    } else {
     
    125125
    126126    # Make sure the user has in fact requested phind indexes
    127     if (!defined($phind)) {
     127    if (!defined($phindcfg)) {
    128128    print "No phind information in $configfilename\n";
    129129    exit;
     
    144144    # Read the archives directory and build the clauses file
    145145    print "\nReading archive directory\n" if $verbosity;
    146     &build_clauses($archivedir, $phindexdir, $language, $verbosity, $maxdocs);
     146    &build_clauses($archivedir, $phindexdir, $language, $phindcfg, $verbosity, $maxdocs);
    147147
    148148    # Generate the vocabulary, symbol statistics, and numbers file
     
    152152
    153153    # Use the suffix program to generate the phindex/phrases file
    154     $command = "suffix $phindexdir $symbol_limit $mode";
    155     print "\nExecuting: $command\n" if $verbosity;
    156     $status = system($command);
    157     if ($status != 0) {
    158     print STDERR "phindgen.pl - Error executing $command: $!\n";
    159     exit($status);
    160     }
     154    &execute("suffix $phindexdir $symbol_limit $mode", $verbosity);
    161155
    162156    # Create the phrase file and put phrase numbers in phindex/phrases
     
    168162    my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
    169163    my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
    170     my $mg_input = &util::filename_cat($phindexdir, "mg-p.txt");
    171 
    172     $command = "$mg_passes -d $phindexdir -f phrase -T1 $mg_input";
    173     print "\nExecuting: $command\n" if $verbosity;
    174     $status = system($command);
    175     if ($status != 0) {
    176     print STDERR "phindgen.pl - Error executing $command: $!\n";
    177     exit($status);
    178     }
    179 
    180     $command = "$mg_compression_dict -d $phindexdir -f phrase";
    181     print "\nExecuting: $command\n" if $verbosity;
    182     $status = system($command);
    183     if ($status != 0) {
    184     print STDERR "phindgen.pl - Error executing $command: $!\n";
    185     exit($status);
    186     }
    187 
    188     $command = "$mg_passes -d $phindexdir -f phrase -T2 $mg_input";
    189     print "\nExecuting: $command\n" if $verbosity;
    190     $status = system($command);
    191     if ($status != 0) {
    192     print STDERR "phindgen.pl - Error executing $command: $!\n";
    193     exit($status);
    194     }
     164
     165    my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
     166    my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
     167    my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
     168    my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
     169
     170    print "\nCreating phrase databases\n";
     171    my $mg_input = &util::filename_cat($phindexdir, "pdata.txt");
     172    my $mg_stem = "pdata";
     173
     174    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
     175    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
     176    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);
     177
     178    # create the mg index of words
     179    print "\nCreating word-level search indexes\n";
     180    $mg_input = &util::filename_cat($phindexdir, "pword.txt");
     181    $mg_stem = "pword";
     182
     183    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
     184    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
     185    &execute("$mg_perf_hash_build -d $phindexdir -f $mg_stem", $verbosity);
     186    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
     187    &execute("$mg_weights_build -d $phindexdir -f $mg_stem", $verbosity);
     188    &execute("$mg_invf_dict -d $phindexdir -f $mg_stem", $verbosity);
     189
     190    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 1", $verbosity);
     191    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 2", $verbosity);
     192    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 3", $verbosity);
     193
     194    # create the mg document information database
     195    print "\nCreating document information databases\n";
     196    $mg_input = &util::filename_cat($phindexdir, "docs.txt");
     197    $mg_stem = "docs";
     198
     199    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
     200    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
     201    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);
     202
    195203
    196204    # Tidy up stray files
     
    199207    &util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
    200208          "$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
    201           "$phindexdir/phrases", "$phindexdir/mg-p.txt");
     209          "$phindexdir/phrases", "$phindexdir/docs.txt");
    202210    my $outfile = 1;
    203211    while (-e "$phindexdir/outPhrase.$outfile") {
     
    208216}
    209217
    210 
     218# Execute a system command
     219
     220sub execute {
     221    my ($command, $verbosity) = @_;
     222    print "Executing: $command\n" if $verbosity;
     223    my $status = system($command);
     224    if ($status != 0) {
     225    print STDERR "phindgen.pl - Error executing $command: $!\n";
     226    exit($status);
     227    }
     228}
    211229
    212230
     
    218236
    219237sub build_clauses {
    220     my ($archive_dir, $phindex_dir, $language, $verbosity, $maxdocs) = @_;
     238    my ($archive_dir, $phindex_dir, $language, $phindcfg, $verbosity, $maxdocs) = @_;
    221239
    222240    # create a "pluginfo" for ArcPlug and RecPlug
     
    227245   
    228246    # create a phind document processor object to process the documents
    229     my $processor = new phproc ($archive_dir, $phindex_dir, $language,
     247    my $processor = new phproc ($archive_dir, $phindex_dir, $phindcfg, $language,
    230248                $doclimit, $verbosity, "STDOUT");
    231249
     
    476494
    477495    # Sort the phrases into order of increasing frequency
     496    # This means the expansions will be sorted correctly later on.
    478497    print "Sorting phrases into freq order\n" if ($verbosity);
    479498    system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases");
     
    495514    #
    496515    # The phrases file looks something like this
    497     # 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
    498     # 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
    499     # 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
    500     # 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
    501 
    502 
     516    #  159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
     517    #  159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
     518    #  159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
     519    #  159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
     520
     521    # The first field on each line is a unique phrase identifier. 
     522    # We need to calculate phrase numbers for each phrase
    503523    print "Calculate phrase numbers\n" if ($verbosity);
    504524   
    505     open(IN, "<$phindex_dir/phrases");
    506 
    507525    my %phrasenumber;
    508526    my $nextphrase = 1;
    509    
    510     my ($line, $num);
    511    
     527    my ($line);
     528   
     529    open(IN, "<$phindex_dir/phrases");
    512530    while(<IN>) {
    513531   
     
    516534    $line = $_;
    517535   
    518     # we're only interested in ther first field
     536    # we're only interested in the first field
    519537    $line =~ s/:.*//;
    520538   
    521539    # get a phrase number for this line
    522     $num = $nextphrase;
    523     $phrasenumber{$line} = $num;
     540    $phrasenumber{$line} = $nextphrase;
    524541    $nextphrase++;
    525542    }
    526543   
    527     # Extract the phrase data
    528     print "Create phrase file and frequency file\n" if ($verbosity);
     544
     545    # Now we create a new phrase file using phrase numbers, not the old IDs.
     546    print "Format phrase data for MGPP\n" if ($verbosity);
    529547   
    530548    open(IN, "<$phindex_dir/phrases");
    531     open(DATA, ">$phindex_dir/mg-p.txt");
    532    
    533     my ($key, $tf, $countexp, $expansions, $countdocs, $documents, $text);
     549    open(DATA, ">$phindex_dir/pdata.txt");
     550    open(IDX, ">$phindex_dir/pword.txt");
     551   
     552    my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
    534553    my @fields;
    535554    my @documents;
     
    545564    @fields = split(/:/, $line);
    546565   
    547     # output the MG document tag
    548     print DATA "<Document>";
    549    
    550566    # get a phrase number for this line
    551567    $key = shift @fields;
    552568    die unless (defined($phrasenumber{$key}));
    553569    $num = $phrasenumber{$key};
    554     print DATA "$num:";
    555570   
    556571    # get the text of the phrase
    557572    $text = shift @fields;
    558573    $text =~ s/s(\d+)/$symbol[$1]/g;
    559     print DATA "$text:";
    560    
     574    if ($text =~ / /) {
     575        $word = "";
     576    } else {
     577        $word = $text;
     578    }
     579
    561580    $linenumber++;
    562581    if ($linenumber % 1000 == 0) {
     
    567586    # get the phrase frequency
    568587    $tf = shift @fields;
    569     print DATA "$tf:";
    570588   
    571589    # get the number of expansions
    572590    $countexp = shift @fields;
    573     print DATA "$countexp:";
    574591   
    575592    # get the expansions and convert them into phrase numbers
     
    581598        push @newexp, $n;
    582599    }
    583     print DATA join(",", (sort numerically @newexp)), ":";
    584    
     600    @newexp = sort numerically @newexp;
     601
    585602    # get the number of documents
    586603    $countdocs = shift @fields;
    587     print DATA "$countdocs:";
    588604   
    589605    # get the documents
     
    591607    $documents =~ s/d//g;
    592608    @documents = split(/;/, $documents);
    593    
    594     print DATA join(";", (sort by_frequency @documents));
    595    
    596     print DATA "\n";
     609    @documents = sort by_frequency @documents;
     610
     611    # output the phrase data
     612    print DATA "<Document>";
     613    print DATA "$num:$text:$tf:$countexp:$countdocs:";
     614    print DATA join(",", @newexp), ":", join(";", @documents), "\n";
     615   
     616    # output the word index search data
     617    print IDX "<Document>$word\n";
     618
    597619   
    598620    }
  • trunk/gsdl/src/phind/generate/phproc.pm

    r1562 r1604  
    3838
    3939sub new {
    40     my ($class, $archive_dir, $phindex_dir,
     40    my ($class, $archive_dir, $phindex_dir, $phindcfg,
    4141    $language, $delimiter, $verbosity, $outhandle) = @_;
    4242
    4343    my $self = new docproc ();
    4444   
    45     # $self->{'collection'} = $collection;
    4645    $self->{'archive_dir'} = $archive_dir;
    4746    $self->{'phindex_dir'} = $phindex_dir;
     47    $self->{'indexes'} = $phindcfg;
    4848
    4949    $language =~ s/,/\|/g;
     
    6060    $self->{'txthandle'} = TEXT;
    6161
    62     &util::rm("$phindex_dir/mg-d.txt") if (-e "$phindex_dir/mg-d.txt");
    63     open(DOCS, ">$phindex_dir/mg-d.txt")
    64     || die "Cannot open $phindex_dir/mg-d.txt: $!";
     62    my $docfile = &util::filename_cat("$phindex_dir", "docs.txt");
     63    &util::rm($docfile) if (-e $docfile);
     64    open(DOCS, ">$docfile")
     65    || die "Cannot open $docfile: $!";
    6566    $self->{'dochandle'} = DOCS;
    6667
     
    9596    my $dochandle = $self->{'dochandle'};
    9697    # print "dochandle: =$dochandle=\n";
    97     print $dochandle "$OID\t$title\n";
    98    
    99     # store the text
    100     $text = convert_gml_to_tokens($doc_obj->get_text());
    101 
     98    print $dochandle "<Document>\t$OID\t$title\n";
     99   
     100    # XXX
     101    # Store the text of this object
     102    my $indexlist = $self->{'indexes'};
     103    my @parts;
     104    my ($index, $part, $level, $field, $section, $data, $text);
     105
     106    # Output the document delimiter
    102107    my $txthandle = $self->{'txthandle'};
    103     print $txthandle $self->{'delimiter'}, "\n$text\n";
     108    print $txthandle $self->{'delimiter'}, "\n";
     109
     110    # Iterate over all the indexes specified in collect.cfg and
     111    # add their text to the clauses file.
     112    foreach $index (@$indexlist) {
     113    $text = "";
     114
     115    # Iterate over all the fields in each index
     116    @parts = split(/,/, $index);
     117    foreach $part (@parts) {
     118
     119        # Each field has a level and a data element (e.g. document:Title)
     120        ($level, $field) = split(/:/, $part);
     121        die unless ($level && $field);
     122
     123        # Extract the text from every section
     124        # (In phind, document:text and section:text are equivalent)
     125        if ($field eq "text") {
     126        $data = "";
     127        $section = $doc_obj->get_top_section();
     128        while (defined($section)) {
     129            $data .= $doc_obj->get_text($section) . "\n";
     130            $section = $doc_obj->get_next_section($section);
     131        }
     132        $text .= convert_gml_to_tokens($data) . "\n";
     133        }
     134
     135        # Extract a metadata field from a document
     136        elsif ($level eq "document") {
     137        $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
     138        $text .= convert_gml_to_tokens($data) . "\n";
     139        }
     140       
     141        # Extract metadata from every section in a document
     142        elsif ($level eq "section") {
     143        $data = "";
     144        $section = $doc_obj->get_top_section();
     145        while (defined($section)) {
     146            $data .= $doc_obj->get_metadata_element($section, $field) . "\n";
     147            $section = $doc_obj->get_next_section($section);
     148        }
     149        $text .= convert_gml_to_tokens($data) . "\n";
     150        }
     151       
     152        # Some sort of specification which I don't understand
     153        else {
     154        die "Unknown level ($level) in phind key ($part) in phind index ($index)\n";
     155        }
     156
     157    }
     158
     159    # print the text
     160    print $txthandle "$text";
     161
     162    }
    104163}
    105164
Note: See TracChangeset for help on using the changeset viewer.