Changeset 28392
- Timestamp:
- 2013-10-13T23:35:33+13:00 (10 years ago)
- Location:
- gs2-extensions/apache-jena/trunk/src/perllib
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuilder.pm
r28391 r28392
 28  28
 29  29   use strict;
 30     - ##no strict 'refs'; # allow filehandles to be variables and viceversa
     30 + no strict 'refs'; # allow filehandles to be variables and viceversa
 31  31
 32  32   use extrabuilder;
gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuildproc.pm
r28391 r28392
 30  30
 31  31   use strict;
 32     - #no strict 'refs'; # allow filehandles to be variables and viceversa
     32 + no strict 'refs'; # allow filehandles to be variables and viceversa
 33  33
 34  34   use util;
 …   …
 51  51
 52  52
     53 +
     54 +
 53  55   sub textedit {
     56 +     my $self = shift (@_);
     57 +     my ($doc_obj) = @_;
     58 +     my $handle = $self->{'output_handle'};
     59 +
     60 +     my $doc_oid = $doc_obj->get_OID();
     61 +
     62 +     my $doc_section = 0; # just for this document
     63 +
     64 +     my $text = "";
     65 +     my $text_extra = "";
     66 +
     67 +     # get the text for this document
     68 +     my $section = $doc_obj->get_top_section();
     69 +     while (defined $section) {
     70 +         # update a few statistics
     71 +         $doc_section++;
     72 +
     73 +         my $title = $doc_obj->get_metadata_element($section, "Title");
     74 +
     75 +         if (defined $title && ($title =~ m/\S/)) {
     76 +             print "$doc_oid: Title = $title\n";
     77 +         }
     78 +
     79 +         my $dc_title = $doc_obj->get_metadata_element($section, "dc.Title");
     80 +
     81 +         if (defined $dc_title && ($dc_title =~ m/\S/)) {
     82 +             print "$doc_oid: dc.Title = $dc_title\n";
     83 +         }
     84 +
     85 +
     86 +         my $id3_title = $doc_obj->get_metadata_element($section, "ex.ID3.Title");
     87 +
     88 +         if (defined $id3_title && ($id3_title =~ m/\S/)) {
     89 +             print "$doc_oid: id3.Title = $id3_title\n";
     90 +         }
     91 +
     92 +         $section = $doc_obj->get_next_section($section);
     93 +     }
     94 +
     95 +     print $handle "$text$text_extra";
     96 + }
     97 +
     98 +
     99 +
    100 + sub texteditADB {
 54 101       my $self = shift (@_);
 55 102       my ($doc_obj,$file,$mode) = @_;
 …   …
123 170
124 171
125     -
126     - sub MGtext {
127     -     my $self = shift (@_);
128     -     my ($doc_obj) = @_;
129     -     my $handle = $self->{'output_handle'};
130     -
131     -     # only output this document if it is one to be indexed
132     -     return if ($doc_obj->get_doc_type() ne "indexed_doc");
133     -
134     -     # see if this document belongs to this subcollection
135     -     my $indexed_doc = $self->is_subcollection_doc($doc_obj);
136     -
137     -     # this is another document
138     -     $self->{'num_docs'} += 1;
139     -
140     -     # get the parameters for the output
141     -     my ($level, $fields) = split (/:/, $self->{'index'});
142     -     $fields =~ s/\ball\b/Title,Creator,text/;
143     -     $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
144     -
145     -     my $doc_section = 0; # just for this document
146     -     my $text = "";
147     -     my $text_extra = "";
148     -
149     -     # get the text for this document
150     -     my $section = $doc_obj->get_top_section();
151     -     while (defined $section) {
152     -         # update a few statistics
153     -         $doc_section++;
154     -         $self->{'num_sections'} += 1;
155     -
156     -         my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
157     -         if (($indexed_doc) && ($indexed_section eq "indexed_section" || $indexed_section eq "indexed_doc")) {
158     -             $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
159     -             foreach my $field (split (/,/, $fields)) {
160     -                 # only deal with this field if it doesn't start with top or
161     -                 # this is the first section
162     -                 my $real_field = $field;
163     -                 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
164     -                     my $new_text = "";
165     -                     if ($level eq "dummy") {
166     -                         # a dummy index is a special case used when no
167     -                         # indexes are specified (since there must always be
168     -                         # at least one index or we can't retrieve the
169     -                         # compressed text) - we add a small amount of text
170     -                         # to these dummy indexes which will never be seen
171     -                         # but will overcome mg's problems with building
172     -                         # empty indexes
173     -                         $new_text = "this is dummy text to stop mg barfing";
174     -                         $self->{'num_processed_bytes'} += length ($new_text);
175     -
176     -                     } elsif ($real_field eq "text") {
177     -                         $new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
178     -                         $self->{'num_processed_bytes'} += length ($new_text);
179     -                         $new_text =~ s/[\cB\cC]//g;
180     -                         $self->find_paragraphs($new_text);
181     -
182     -                     } else {
183     -                         my $first = 1;
184     -                         $real_field =~ s/^ex\.([^.]+)$/$1/; # remove ex. namespace iff it's the only namespace prefix (will leave ex.dc.* intact)
185     -                         my @section_metadata = @{$doc_obj->get_metadata ($section, $real_field)};
186     -                         if ($level eq "section" && $section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
187     -                             if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
188     -                                 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $real_field)});
189     -                             }
190     -                         }
191     -                         foreach my $meta (@section_metadata) {
192     -                             $meta =~ s/[\cB\cC]//g;
193     -                             $self->{'num_processed_bytes'} += length ($meta);
194     -                             $new_text .= "\cC" unless $first;
195     -                             $new_text .= $meta if $self->{'store_text'};
196     -                             $first = 0;
197     -                         }
198     -                     }
199     -
200     -                     # filter the text
201     -                     $new_text = $self->filter_text ($field, $new_text);
202     -
203     -                     $text .= "$new_text\cC";
204     -                 }
205     -             }
206     -         }
207     -
208     -         if ($level eq "document") { $text_extra .= "\cB"; }
209     -         else { $text .= "\cB"; }
210     -
211     -         $section = $doc_obj->get_next_section($section);
212     -     }
213     -
214     -     print $handle "$text$text_extra";
215     - }
216     -
217     -
218 172   1;
Note:
See TracChangeset
for help on using the changeset viewer.