Context Navigation

← Previous Change
Next Change →

phind

Timestamp:

2000-10-17T12:35:59+13:00 (24 years ago)

Author:

paynter

Message:

Numerous improvements for use with the new phindcgi script. The main ones
are that three MGPP databases are now created (document data, phrase data,
and word search) and that the data extracted from each document is set
explicitly in the collection configuration file (usually it will be
something like document:text or section:Title).

Location:

trunk/gsdl/src/phind/generate

Files:

: 2 edited

phindgen.pl (modified) (17 diffs)
phproc.pm (modified) (3 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/src/phind/generate/phindgen.pl

-              r1591
+              r1604
     my ($verbosity, $archivedir, $phindexdir,
     $phind, $language, $maxdocs, $untidy,
+    $phindcfg, $language, $maxdocs, $untidy,
     $collection, $configfilename, $collectcfg);
 …
+    }
     if (defined $collectcfg->{'phind'}) {
         $phind = $collectcfg->{'phind'};
+        $phindcfg = $collectcfg->{'phind'};
+    }
     } else {
 …
     # Make sure the user has in fact requested phind indexes
     if (!defined($phind)) {
+    if (!defined($phindcfg)) {
     print "No phind information in $configfilename\n";
     exit;
 …
     # Read the archives directory and build the clauses file
     print "\nReading archive directory\n" if $verbosity;
     &build_clauses($archivedir, $phindexdir, $language, $verbosity, $maxdocs);
+    &build_clauses($archivedir, $phindexdir, $language, $phindcfg, $verbosity, $maxdocs);
     # Generate the vocabulary, symbol statistics, and numbers file
 …
     # Use the suffix program to generate the phindex/phrases file
+    $command = "suffix $phindexdir $symbol_limit $mode";
+    print "\nExecuting: $command\n" if $verbosity;
+    $status = system($command);
+    if ($status != 0) {
+    print STDERR "phindgen.pl - Error executing $command: $!\n";
+    exit($status);
+    }
+    &execute("suffix $phindexdir $symbol_limit $mode", $verbosity);
     # Create the phrase file and put phrase numbers in phindex/phrases
 …
     my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
     my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
+    my $mg_input = &util::filename_cat($phindexdir, "mg-p.txt");
+    $command = "$mg_passes -d $phindexdir -f phrase -T1 $mg_input";
+    print "\nExecuting: $command\n" if $verbosity;
+    $status = system($command);
+    if ($status != 0) {
+    print STDERR "phindgen.pl - Error executing $command: $!\n";
+    exit($status);
+    }
+    $command = "$mg_compression_dict -d $phindexdir -f phrase";
+    print "\nExecuting: $command\n" if $verbosity;
+    $status = system($command);
+    if ($status != 0) {
+    print STDERR "phindgen.pl - Error executing $command: $!\n";
+    exit($status);
+    }
+    $command = "$mg_passes -d $phindexdir -f phrase -T2 $mg_input";
+    print "\nExecuting: $command\n" if $verbosity;
+    $status = system($command);
+    if ($status != 0) {
+    print STDERR "phindgen.pl - Error executing $command: $!\n";
+    exit($status);
+    }
+    my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
+    my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
+    my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
+    my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
+    print "\nCreating phrase databases\n";
+    my $mg_input = &util::filename_cat($phindexdir, "pdata.txt");
+    my $mg_stem = "pdata";
+    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
+    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
+    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);
+    # create the mg index of words
+    print "\nCreating word-level search indexes\n";
+    $mg_input = &util::filename_cat($phindexdir, "pword.txt");
+    $mg_stem = "pword";
+    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
+    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
+    &execute("$mg_perf_hash_build -d $phindexdir -f $mg_stem", $verbosity);
+    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
+    &execute("$mg_weights_build -d $phindexdir -f $mg_stem", $verbosity);
+    &execute("$mg_invf_dict -d $phindexdir -f $mg_stem", $verbosity);
+    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 1", $verbosity);
+    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 2", $verbosity);
+    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 3", $verbosity);
+    # create the mg document information database
+    print "\nCreating document information databases\n";
+    $mg_input = &util::filename_cat($phindexdir, "docs.txt");
+    $mg_stem = "docs";
+    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
+    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
+    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);
     # Tidy up stray files
 …
     &util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
           "$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
           "$phindexdir/phrases", "$phindexdir/mg-p.txt");
+          "$phindexdir/phrases", "$phindexdir/docs.txt");
     my $outfile = 1;
     while (-e "$phindexdir/outPhrase.$outfile") {
 …
+}
+# Execute a system command, echoing it first when $verbosity is set.
+# Returns normally when the command succeeds; otherwise reports the
+# cause on STDERR and terminates the script with a non-zero exit code.
+#   $command   - command line handed to system()
+#   $verbosity - true to print the command before running it
+sub execute {
+    my ($command, $verbosity) = @_;
+    print "Executing: $command\n" if $verbosity;
+    my $status = system($command);
+    return if ($status == 0);
+
+    # system() returns a wait status rather than an exit code: -1 means
+    # the command could not be started (the only case where $! is set),
+    # the low seven bits hold a terminating signal, and the child's exit
+    # code lives in the high byte.
+    my ($reason, $exitcode);
+    if ($status == -1) {
+        $reason = "could not be started: $!";
+        $exitcode = 1;
+    } elsif ($status & 127) {
+        $reason = "terminated by signal " . ($status & 127);
+        $exitcode = 1;
+    } else {
+        $exitcode = ($status >> 8) || 1;
+        $reason = "exited with status $exitcode";
+    }
+    print STDERR "phindgen.pl - Error executing $command: $reason\n";
+
+    # Passing the raw wait status (e.g. 256) to exit() would be truncated
+    # to 0 and make the failure look like success to the caller.
+    exit($exitcode);
+}
 …
 sub build_clauses {
     my ($archive_dir, $phindex_dir, $language, $verbosity, $maxdocs) = @_;
+    my ($archive_dir, $phindex_dir, $language, $phindcfg, $verbosity, $maxdocs) = @_;
     # create a "pluginfo" for ArcPlug and RecPlug
 …
     # create a phind document processor object to process the documents
     my $processor = new phproc ($archive_dir, $phindex_dir, $language,
+    my $processor = new phproc ($archive_dir, $phindex_dir, $phindcfg, $language,
                 $doclimit, $verbosity, "STDOUT");
 …
     # Sort the phrases into order of decreasing frequency
+    # This means the expansions will be sorted correctly later on.
     print "Sorting phrases into freq order\n" if ($verbosity);
     system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases");
 …
+    #
     # The phrases file looks something like this
+    # 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
+    # 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
+    # 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
+    # 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
+    #  159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
+    #  159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
+    #  159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
+    #  159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
+    # The first field on each line is a unique phrase identifier.
+    # We need to calculate phrase numbers for each phrase
     print "Calculate phrase numbers\n" if ($verbosity);
-    open(IN, "<$phindex_dir/phrases");
     my %phrasenumber;
     my $nextphrase = 1;
     my ($line, $num);
+    my ($line);
+    open(IN, "<$phindex_dir/phrases");
     while(<IN>) {
 …
     $line = $_;
     # we're only interested in ther first field
+    # we're only interested in the first field
     $line =~ s/:.*//;
     # get a phrase number for this line
+    $num = $nextphrase;
+    $phrasenumber{$line} = $num;
+    $phrasenumber{$line} = $nextphrase;
     $nextphrase++;
+    }
+    # Extract the phrase data
+    print "Create phrase file and frequency file\n" if ($verbosity);
+    # Now we create a new phrase file using phrase numbers, not the old IDs.
+    print "Format phrase data for MGPP\n" if ($verbosity);
     open(IN, "<$phindex_dir/phrases");
+    open(DATA, ">$phindex_dir/mg-p.txt");
+    my ($key, $tf, $countexp, $expansions, $countdocs, $documents, $text);
+    open(DATA, ">$phindex_dir/pdata.txt");
+    open(IDX, ">$phindex_dir/pword.txt");
+    my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
     my @fields;
     my @documents;
 …
     @fields = split(/:/, $line);
-    # output the MG document tag
-    print DATA "<Document>";
     # get a phrase number for this line
     $key = shift @fields;
     die unless (defined($phrasenumber{$key}));
     $num = $phrasenumber{$key};
-    print DATA "$num:";
     # get the text of the phrase
     $text = shift @fields;
     $text =~ s/s(\d+)/$symbol[$1]/g;
+    print DATA "$text:";
+    if ($text =~ / /) {
+        $word = "";
+    } else {
+        $word = $text;
+    }
     $linenumber++;
     if ($linenumber % 1000 == 0) {
 …
     # get the phrase frequency
     $tf = shift @fields;
-    print DATA "$tf:";
     # get the number of expansions
     $countexp = shift @fields;
-    print DATA "$countexp:";
     # get the expansions and convert them into phrase numbers
 …
         push @newexp, $n;
+    }
     print DATA join(",", (sort numerically @newexp)), ":";
+    @newexp = sort numerically @newexp;
     # get the number of documents
     $countdocs = shift @fields;
-    print DATA "$countdocs:";
     # get the documents
 …
     $documents =~ s/d//g;
     @documents = split(/;/, $documents);
+    print DATA join(";", (sort by_frequency @documents));
+    print DATA "\n";
+    @documents = sort by_frequency @documents;
+    # output the phrase data
+    print DATA "<Document>";
+    print DATA "$num:$text:$tf:$countexp:$countdocs:";
+    print DATA join(",", @newexp), ":", join(";", @documents), "\n";
+    # output the word index search data
+    print IDX "<Document>$word\n";
+    }

trunk/gsdl/src/phind/generate/phproc.pm

-              r1562
+              r1604
 sub new {
     my ($class, $archive_dir, $phindex_dir,
+    my ($class, $archive_dir, $phindex_dir, $phindcfg,
     $language, $delimiter, $verbosity, $outhandle) = @_;
     my $self = new docproc ();
-    # $self->{'collection'} = $collection;
     $self->{'archive_dir'} = $archive_dir;
     $self->{'phindex_dir'} = $phindex_dir;
+    $self->{'indexes'} = $phindcfg;
     $language =~ s/,/\|/g;
 …
     $self->{'txthandle'} = TEXT;
+    &util::rm("$phindex_dir/mg-d.txt") if (-e "$phindex_dir/mg-d.txt");
+    open(DOCS, ">$phindex_dir/mg-d.txt")
+    || die "Cannot open $phindex_dir/mg-d.txt: $!";
+    my $docfile = &util::filename_cat("$phindex_dir", "docs.txt");
+    &util::rm($docfile) if (-e $docfile);
+    open(DOCS, ">$docfile")
+    || die "Cannot open $docfile: $!";
     $self->{'dochandle'} = DOCS;
 …
     my $dochandle = $self->{'dochandle'};
     # print "dochandle: =$dochandle=\n";
+    print $dochandle "$OID\t$title\n";
+    # store the text
+    $text = convert_gml_to_tokens($doc_obj->get_text());
+    print $dochandle "<Document>\t$OID\t$title\n";
+    # XXX
+    # Store the text of this object
+    my $indexlist = $self->{'indexes'};
+    my @parts;
+    my ($index, $part, $level, $field, $section, $data, $text);
+    # Output the document delimiter
     my $txthandle = $self->{'txthandle'};
+    print $txthandle $self->{'delimiter'}, "\n$text\n";
+    print $txthandle $self->{'delimiter'}, "\n";
+    # Iterate over all the indexes specified in collect.cfg and
+    # add their text to the clauses file.
+    foreach $index (@$indexlist) {
+    $text = "";
+    # Iterate over all the fields in each index
+    @parts = split(/,/, $index);
+    foreach $part (@parts) {
+        # Each field has a level and a data element (e.g. document:Title)
+        ($level, $field) = split(/:/, $part);
+        die unless ($level && $field);
+        # Extract the text from every section
+        # (In phind, document:text and section:text are equivalent)
+        if ($field eq "text") {
+        $data = "";
+        $section = $doc_obj->get_top_section();
+        while (defined($section)) {
+            $data .= $doc_obj->get_text($section) . "\n";
+            $section = $doc_obj->get_next_section($section);
+        }
+        $text .= convert_gml_to_tokens($data) . "\n";
+        }
+        # Extract a metadata field from a document
+        elsif ($level eq "document") {
+        $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
+        $text .= convert_gml_to_tokens($data) . "\n";
+        }
+        # Extract metadata from every section in a document
+        elsif ($level eq "section") {
+        $data = "";
+        $section = $doc_obj->get_top_section();
+        while (defined($section)) {
+            $data .= $doc_obj->get_metadata_element($section, $field) . "\n";
+            $section = $doc_obj->get_next_section($section);
+        }
+        $text .= convert_gml_to_tokens($data) . "\n";
+        }
+        # Some sort of specification which I don't understand
+        else {
+        die "Unknown level ($level) in phind key ($part) in phind index ($index)\n";
+        }
+    }
+    # print the text
+    print $txthandle "$text";
+    }
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1604 for trunk/gsdl/src/phind

Legend:

trunk/gsdl/src/phind/generate/phindgen.pl

trunk/gsdl/src/phind/generate/phproc.pm

Download in other formats: