Show
Ignore:
Timestamp:
13.10.2013 23:35:33 (6 years ago)
Author:
davidb
Message:

Simple version that looks for a hardwired title as metadata

Location:
gs2-extensions/apache-jena/trunk/src/perllib
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuilder.pm

    r28391 r28392  
    2828 
    2929use strict; 
    30 ##no strict 'refs'; # allow filehandles to be variables and viceversa 
     30no strict 'refs'; # allow filehandles to be variables and viceversa 
    3131 
    3232use extrabuilder; 
  • gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuildproc.pm

    r28391 r28392  
    3030 
    3131use strict;  
    32 #no strict 'refs'; # allow filehandles to be variables and viceversa 
     32no strict 'refs'; # allow filehandles to be variables and viceversa 
    3333 
    3434use util; 
     
    5151 
    5252 
     53 
     54 
    # textedit($doc_obj)
    #
    # Walks every section of $doc_obj, starting at get_top_section() and
    # advancing with get_next_section(), and reports any title-like metadata
    # it finds.  For each section the first value of "Title", "dc.Title" and
    # "ex.ID3.Title" is fetched with get_metadata_element(); each value that is
    # defined and contains at least one non-whitespace character is printed,
    # prefixed with the document OID, using a plain `print` (i.e. to the
    # currently selected output handle, not to $handle).
    #
    # Parameters:
    #   $self    - the buildproc object; only $self->{'output_handle'} is read.
    #   $doc_obj - document object providing get_OID, get_top_section,
    #              get_next_section and get_metadata_element.
    #
    # Returns: whatever the final print returns; no explicit return value.
    #
    # NOTE(review): $text and $text_extra are initialised to "" and never
    # assigned again in this block, so the final `print $handle` writes an
    # empty string.  Presumably a placeholder for real output to come -- confirm.
    5355sub textedit { 
     56    my $self = shift (@_); 
     57    my ($doc_obj) = @_; 
     58    my $handle = $self->{'output_handle'}; 
     59     
     60    my $doc_oid = $doc_obj->get_OID(); 
     61 
          # NOTE(review): $doc_section is incremented per section below but
          # never read anywhere in this block.
     62    my $doc_section = 0; # just for this document 
     63 
     64    my $text = ""; 
     65    my $text_extra = ""; 
     66 
     67    # get the text for this document 
     68    my $section = $doc_obj->get_top_section(); 
     69    while (defined $section) { 
     70    # update a few statistics 
     71    $doc_section++; 
     72 
          # Plain (un-namespaced) Title metadata for this section.
     73    my $title = $doc_obj->get_metadata_element($section, "Title"); 
     74 
     75    if (defined $title && ($title =~ m/\S/)) { 
     76        print "$doc_oid: Title = $title\n"; 
     77    } 
     78 
          # dc.Title metadata for this section.
     79    my $dc_title = $doc_obj->get_metadata_element($section, "dc.Title"); 
     80 
     81    if (defined $dc_title && ($dc_title =~ m/\S/)) { 
     82        print "$doc_oid: dc.Title = $dc_title\n"; 
     83    } 
     84 
     85 
          # The key looked up is "ex.ID3.Title", but the label printed
          # below is the shorter "id3.Title".
     86    my $id3_title = $doc_obj->get_metadata_element($section, "ex.ID3.Title"); 
     87 
     88    if (defined $id3_title && ($id3_title =~ m/\S/)) { 
     89        print "$doc_oid: id3.Title = $id3_title\n"; 
     90    } 
     91     
     92    $section = $doc_obj->get_next_section($section); 
     93    } 
     94 
          # Both strings are still empty here (see NOTE(review) in the header).
     95    print $handle "$text$text_extra"; 
     96} 
     97 
     98 
     99 
     100sub texteditADB { 
    54101    my $self = shift (@_); 
    55102    my ($doc_obj,$file,$mode) = @_; 
     
    123170 
    124171 
    125  
    # MGtext($doc_obj)   (removed in r28392)
    #
    # Builds one text stream for $doc_obj and prints it to
    # $self->{'output_handle'}.  Documents whose get_doc_type() is not
    # "indexed_doc" are skipped entirely.
    #
    # $self->{'index'} is split on ':' into a level and a comma-separated
    # field list; "all" and "topall" in the field list are expanded to their
    # Title/Creator/text triples.  For every section, each field's text is
    # appended to the output followed by \cC, and each section is closed with
    # \cB (collected in $text_extra instead when the level is "document", so
    # those terminators are emitted after all of the text).
    #
    # Counters updated on $self: num_docs, num_sections, num_bytes,
    # num_processed_bytes.
    #
    # Parameters:
    #   $self    - buildproc object; reads 'output_handle', 'index',
    #              'store_text', 'indexing_text' and
    #              'sections_index_document_metadata'.
    #   $doc_obj - document object queried for type, sections, text and metadata.
    126 sub MGtext { 
    127     my $self = shift (@_); 
    128     my ($doc_obj) = @_; 
    129     my $handle = $self->{'output_handle'}; 
    130      
    131     # only output this document if it is one to be indexed 
    132     return if ($doc_obj->get_doc_type() ne "indexed_doc"); 
    133      
    134     # see if this document belongs to this subcollection 
    135     my $indexed_doc = $self->is_subcollection_doc($doc_obj); 
    136  
    137     # this is another document 
    138     $self->{'num_docs'} += 1; 
    139  
    140     # get the parameters for the output 
    141     my ($level, $fields) = split (/:/, $self->{'index'}); 
    142     $fields =~ s/\ball\b/Title,Creator,text/; 
    143     $fields =~ s/\btopall\b/topTitle,topCreator,toptext/; 
    144  
    145     my $doc_section = 0; # just for this document 
    146     my $text = ""; 
    147     my $text_extra = ""; 
    148  
    149     # get the text for this document 
    150     my $section = $doc_obj->get_top_section(); 
    151     while (defined $section) { 
    152     # update a few statistics 
    153     $doc_section++; 
    154     $self->{'num_sections'} += 1; 
    155  
            # A section with no (or false) gsdldoctype metadata is treated as "indexed_section".
    156     my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section"; 
    157     if (($indexed_doc) && ($indexed_section eq "indexed_section" || $indexed_section eq "indexed_doc")) { 
    158         $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 
    159         foreach my $field (split (/,/, $fields)) { 
    160         # only deal with this field if it doesn't start with top or 
    161         # this is the first section 
    162         my $real_field = $field; 
                # The s/^top// both tests for and strips the "top" prefix from
                # $real_field; $field itself keeps the prefix.
    163         if (!($real_field =~ s/^top//) || ($doc_section == 1)) { 
    164             my $new_text = ""; 
    165             if ($level eq "dummy") { 
    166             # a dummy index is a special case used when no 
    167             # indexes are specified (since there must always be 
    168             # at least one index or we can't retrieve the 
    169             # compressed text) - we add a small amount of text 
    170             # to these dummy indexes which will never be seen 
    171             # but will overcome mg's problems with building 
    172             # empty indexes 
    173             $new_text = "this is dummy text to stop mg barfing"; 
    174             $self->{'num_processed_bytes'} += length ($new_text); 
    175  
    176             } elsif ($real_field eq "text") { 
                    # $new_text stays "" when store_text is false.
    177             $new_text = $doc_obj->get_text ($section) if $self->{'store_text'}; 
    178             $self->{'num_processed_bytes'} += length ($new_text); 
                    # \cB and \cC are used as terminators in the output below, so
                    # they are stripped from the content itself.
    179             $new_text =~ s/[\cB\cC]//g; 
                    # NOTE(review): find_paragraphs is defined elsewhere; whether it
                    # modifies $new_text in place cannot be told from here.
    180             $self->find_paragraphs($new_text); 
    181              
    182             } else { 
    183             my $first = 1; 
    184             $real_field =~ s/^ex\.([^.]+)$/$1/; # remove ex. namespace iff it's the only namespace prefix (will leave ex.dc.* intact) 
    185             my @section_metadata = @{$doc_obj->get_metadata ($section, $real_field)}; 
                    # For a section-level index on a non-top section, optionally add the
                    # top section's values for this field: always, or only when the
                    # section has none of its own, depending on the setting.
    186             if ($level eq "section" && $section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) { 
    187                 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) { 
    188                 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $real_field)}); 
    189                 } 
    190             } 
                    # Values are joined with \cC.  The separator is added regardless of
                    # store_text; the value itself is only added when store_text is true.
    191             foreach my $meta (@section_metadata) { 
    192                 $meta =~ s/[\cB\cC]//g; 
    193                 $self->{'num_processed_bytes'} += length ($meta); 
    194                 $new_text .= "\cC" unless $first; 
    195                 $new_text .= $meta if $self->{'store_text'}; 
    196                 $first = 0; 
    197             } 
    198             } 
    199              
    200             # filter the text 
                    # Called with the original $field (any "top" prefix intact), not $real_field.
    201             $new_text = $self->filter_text ($field, $new_text); 
    202  
    203             $text .= "$new_text\cC"; 
    204         } 
    205         } 
    206     } 
    207      
            # One \cB per section: deferred into $text_extra for a "document"
            # level index, otherwise appended inline.
    208     if ($level eq "document") { $text_extra .= "\cB"; } 
    209     else { $text .= "\cB"; } 
    210      
    211     $section = $doc_obj->get_next_section($section); 
    212     } 
    213  
    214     print $handle "$text$text_extra"; 
    215 } 
    216  
    217  
    2181721;