Ignore:
Timestamp:
2000-10-17T12:35:59+13:00 (24 years ago)
Author:
paynter
Message:

Numerous improvements for use with the new phindcgi script. The main ones
are that three MGPP databases are now created (document data, phrase data,
and word search) and that the data extracted from each document is set
explicitly in the collection configuration file (usually it will be
something like document:text or section:Title).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/phind/generate/phproc.pm

    r1562 r1604  
    3838
    3939sub new {
    40     my ($class, $archive_dir, $phindex_dir,
     40    my ($class, $archive_dir, $phindex_dir, $phindcfg,
    4141    $language, $delimiter, $verbosity, $outhandle) = @_;
    4242
    4343    my $self = new docproc ();
    4444   
    45     # $self->{'collection'} = $collection;
    4645    $self->{'archive_dir'} = $archive_dir;
    4746    $self->{'phindex_dir'} = $phindex_dir;
     47    $self->{'indexes'} = $phindcfg;
    4848
    4949    $language =~ s/,/\|/g;
     
    6060    $self->{'txthandle'} = TEXT;
    6161
    62     &util::rm("$phindex_dir/mg-d.txt") if (-e "$phindex_dir/mg-d.txt");
    63     open(DOCS, ">$phindex_dir/mg-d.txt")
    64     || die "Cannot open $phindex_dir/mg-d.txt: $!";
     62    my $docfile = &util::filename_cat("$phindex_dir", "docs.txt");
     63    &util::rm($docfile) if (-e $docfile);
     64    open(DOCS, ">$docfile")
     65    || die "Cannot open $docfile: $!";
    6566    $self->{'dochandle'} = DOCS;
    6667
     
    9596    my $dochandle = $self->{'dochandle'};
    9697    # print "dochandle: =$dochandle=\n";
    97     print $dochandle "$OID\t$title\n";
    98    
    99     # store the text
    100     $text = convert_gml_to_tokens($doc_obj->get_text());
    101 
     98    print $dochandle "<Document>\t$OID\t$title\n";
     99   
     100    # XXX
     101    # Store the text of this object
     102    my $indexlist = $self->{'indexes'};
     103    my @parts;
     104    my ($index, $part, $level, $field, $section, $data, $text);
     105
     106    # Output the document delimiter
    102107    my $txthandle = $self->{'txthandle'};
    103     print $txthandle $self->{'delimiter'}, "\n$text\n";
     108    print $txthandle $self->{'delimiter'}, "\n";
     109
     110    # Iterate over all the indexes specified in collect.cfg and
     111    # add their text to the clauses file.
     112    foreach $index (@$indexlist) {
     113    $text = "";
     114
     115    # Iterate over all the fields in each index
     116    @parts = split(/,/, $index);
     117    foreach $part (@parts) {
     118
     119        # Each field has a level and a data element (e.g. document:Title)
     120        ($level, $field) = split(/:/, $part);
     121        die unless ($level && $field);
     122
     123        # Extract the text from every section
     124        # (In phind, document:text and section:text are equivalent)
     125        if ($field eq "text") {
     126        $data = "";
     127        $section = $doc_obj->get_top_section();
     128        while (defined($section)) {
     129            $data .= $doc_obj->get_text($section) . "\n";
     130            $section = $doc_obj->get_next_section($section);
     131        }
     132        $text .= convert_gml_to_tokens($data) . "\n";
     133        }
     134
     135        # Extract a metadata field from a document
     136        elsif ($level eq "document") {
     137        $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
     138        $text .= convert_gml_to_tokens($data) . "\n";
     139        }
     140       
     141        # Extract metadata from every section in a document
     142        elsif ($level eq "section") {
     143        $data = "";
     144        $section = $doc_obj->get_top_section();
     145        while (defined($section)) {
     146            $data .= $doc_obj->get_metadata_element($section, $field) . "\n";
     147            $section = $doc_obj->get_next_section($section);
     148        }
     149        $text .= convert_gml_to_tokens($data) . "\n";
     150        }
     151       
     152        # Some sort of specification which I don't understand
     153        else {
     154        die "Unknown level ($level) in phind key ($part) in phind index ($index)\n";
     155        }
     156
     157    }
     158
     159    # print the text
     160    print $txthandle "$text";
     161
     162    }
    104163}
    105164
Note: See TracChangeset for help on using the changeset viewer.