Context Navigation

← Previous Changeset
Next Changeset →

Changeset 9178

Timestamp:

2005-02-24T16:56:48+13:00 (19 years ago)

Author:

kjdon

Message:

Tried to tidy up the paragraph tagging stuff, but it's still not working - it needs a gs2:id attribute in the para tag to work properly with the indexing as is. This needs to be thought about more.

File:

1 edited

trunk/gsdl/perllib/lucenebuildproc.pm (modified) (6 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/lucenebuildproc.pm

-              r8716
+              r9178
     my $lpar_level = $levels->{'paragraph'};
-    my $doc_idx_att = ($ldoc_level) ? "index=\"1\"" : "";
-    my $sec_idx_att = ($lsec_level) ? "index=\"1\"" : "";
-    my $par_idx_att = ($lpar_level) ? "index=\"1\"" : "";
     my $docid="";
     if ($ldoc_level) {
 …
     $sectiontag = $mgppbuildproc::level_map{'section'};
+    }
+    my ($paratag) = "";
+    my ($parastarttag) = "";
+    my ($paraendtag) = "";
     if ($self->{'levels'}->{'paragraph'}) {
     if ($self->{'strip_html'}) {
+        $paratag = "<". $mgppbuildproc::level_map{'paragraph'} . ">";
+        $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">";
+        $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">";
     } else {
         print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
+    }
+    }
     my $doc_section = 0; # just for this document
-##    my $text = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'; ## ****
     my $text = "";
     $text .= $documenttag;
     # get the text for this document
     my $section = $doc_obj->get_top_section();
 …
+    }
+    # if we are doing subcollections, then some docs shouldn't be indexed.
+    # but we need to put the section tag placeholders in there so the
+    # sections match up with gdbm db
     if ($indexed_doc) {
         if ($self->{'indexing_text'}) {
         $text .= "$paratag"; # only add para tags for indexing
+        #if ($self->{'indexing_text'}) {
+    #   $text .= "$parastarttag"; # only add para tags for indexing
         # note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text
+        }
+       # }
         $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
         foreach my $field (split (/,/, $fields)) {
 …
             if ($real_field eq "text") {
             if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
                 $new_text .= "$paratag<TX index=\"1\">\n";
+                $new_text .= "$parastarttag<TX index=\"1\">\n";
                 $tmp_text .= $doc_obj->get_text ($section);
+                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paratag<TX index=\"1\">");
+                if ($parastarttag =~ "") {
+                # we don't want to individually tag each paragraph if not doing para indexing
+                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "");
+                } else {
+                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paraendtag$parastarttag<TX index=\"1\">");
+                }
                 $new_text .= "$tmp_text</TX>\n";
+                $new_text .= "$tmp_text</TX>$paraendtag\n";
                 #if (!defined $self->{'indexfields'}->{'TextOnly'}) {
                 #$self->{'indexfields'}->{'TextOnly'} = 1;
                 #}
+            }
+            else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment
+                $new_text .= $doc_obj->get_text ($section) if $self->{'store_text'};
+            else { # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment
+                if ($self->{'store_text'}) {
+                $tmp_text .= $doc_obj->get_text ($section);
+                $tmp_text =~ s/</&lt;/g;
+                $tmp_text =~ s/>/&gt;/g;
+                $new_text .= $tmp_text;
+                }
+            }
             } else { # metadata field
 …
                     $self->{'indexfieldmap'}->{$shortname} = 1;
+                    }
                     $new_text .= "$paratag<$shortname index=\"1\">$mvalue</$shortname>\n";
+                    $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
                     if (!defined $self->{'indexfields'}->{$mfield}) {
                     $self->{'indexfields'}->{$mfield} = 1;
 …
+                }
                 foreach $item (@{$doc_obj->get_metadata ($section, $real_field)}) {
                 $new_text .= "$paratag<$shortname index=\"1\">$item</$shortname>\n";
+                $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
+                }
+            }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 9178

Legend:

trunk/gsdl/perllib/lucenebuildproc.pm

Download in other formats: