Changeset 28392


Ignore:
Timestamp:
2013-10-13T23:35:33+13:00 (11 years ago)
Author:
davidb
Message:

Simple version that looks for a hardwired title as metadata

Location:
gs2-extensions/apache-jena/trunk/src/perllib
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuilder.pm

    r28391 r28392  
    2828
    2929use strict;
    30 ##no strict 'refs'; # allow filehandles to be variables and viceversa
     30no strict 'refs'; # allow filehandles to be variables and viceversa
    3131
    3232use extrabuilder;
  • gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuildproc.pm

    r28391 r28392  
    3030
    3131use strict;
    32 #no strict 'refs'; # allow filehandles to be variables and viceversa
     32no strict 'refs'; # allow filehandles to be variables and viceversa
    3333
    3434use util;
     
    5151
    5252
     53
     54
    # textedit($doc_obj)
    #
    # Walks every section of $doc_obj (top section first, then whatever
    # get_next_section() yields until it returns undef) and, for each section,
    # looks up three hardwired title metadata elements.  Any value that is
    # defined and contains at least one non-whitespace character is reported
    # on the currently selected output handle as
    #     "<doc OID>: <label> = <value>".
    #
    # Nothing is accumulated for the build output yet: the string finally
    # printed to $self->{'output_handle'} is always empty.
    sub textedit {
        my ($self, $doc_obj) = @_;

        my $handle  = $self->{'output_handle'};
        my $doc_oid = $doc_obj->get_OID();

        # [ metadata element to look up, label used in the report line ].
        # Note the ID3 entry is reported under a shorter label than the
        # metadata name it is read from.
        my @title_fields = (
            [ "Title",        "Title"     ],
            [ "dc.Title",     "dc.Title"  ],
            [ "ex.ID3.Title", "id3.Title" ],
        );

        my $section_count = 0;  # sections seen in this document
        my $text          = "";
        my $text_extra    = "";

        for (my $section = $doc_obj->get_top_section();
             defined $section;
             $section = $doc_obj->get_next_section($section)) {

            $section_count++;

            foreach my $title_field (@title_fields) {
                my ($meta_name, $label) = @{$title_field};

                my $value = $doc_obj->get_metadata_element($section, $meta_name);
                next unless (defined $value && $value =~ m/\S/);

                print "$doc_oid: $label = $value\n";
            }
        }

        print {$handle} "$text$text_extra";
    }
     97
     98
     99
     100sub texteditADB {
    54101    my $self = shift (@_);
    55102    my ($doc_obj,$file,$mode) = @_;
     
    123170
    124171
    125 
    # MGtext($doc_obj)
    #
    # Builds the index text for one document and prints it to
    # $self->{'output_handle'}.
    #
    # $self->{'index'} is split on ':' into a level and a comma-separated field
    # list.  For every section of the document, each field contributes a chunk
    # of text terminated by \cC; multiple metadata values inside one chunk are
    # also separated by \cC.  One \cB is emitted per section: appended directly
    # to the text for most levels, but collected and emitted after all the text
    # when the level is "document".
    #
    # Counters updated on $self: num_docs, num_sections, num_bytes and
    # num_processed_bytes.  Documents whose doc type is not "indexed_doc"
    # return early without touching any of them.
    126 sub MGtext {
    127     my $self = shift (@_);
    128     my ($doc_obj) = @_;
    129     my $handle = $self->{'output_handle'};
    130    
    131     # only output this document if it is one to be indexed
    132     return if ($doc_obj->get_doc_type() ne "indexed_doc");
    133    
    134     # see if this document belongs to this subcollection
    135     my $indexed_doc = $self->is_subcollection_doc($doc_obj);
    136 
    137     # this is another document
    138     $self->{'num_docs'} += 1;
    139 
    140     # get the parameters for the output
            # "all" and "topall" are shorthands expanded to explicit field lists;
            # the \b anchors keep the "all" rule from matching inside "topall"
    141     my ($level, $fields) = split (/:/, $self->{'index'});
    142     $fields =~ s/\ball\b/Title,Creator,text/;
    143     $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
    144 
    145     my $doc_section = 0; # just for this document
    146     my $text = "";
    147     my $text_extra = "";
    148 
    149     # get the text for this document
    150     my $section = $doc_obj->get_top_section();
    151     while (defined $section) {
    152     # update a few statistics
    153     $doc_section++;
    154     $self->{'num_sections'} += 1;
    155 
            # a section with no gsdldoctype metadata is treated as "indexed_section";
            # text is only gathered when the document is in the subcollection and
            # the section is marked as indexed
    156     my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
    157     if (($indexed_doc) && ($indexed_section eq "indexed_section" || $indexed_section eq "indexed_doc")) {
    158         $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
    159         foreach my $field (split (/,/, $fields)) {
    160         # only deal with this field if it doesn't start with top or
    161         # this is the first section
                # ($real_field is $field with any leading "top" removed; the s///
                # both strips the prefix and reports whether it was present)
    162         my $real_field = $field;
    163         if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
    164             my $new_text = "";
    165             if ($level eq "dummy") {
    166             # a dummy index is a special case used when no
    167             # indexes are specified (since there must always be
    168             # at least one index or we can't retrieve the
    169             # compressed text) - we add a small amount of text
    170             # to these dummy indexes which will never be seen
    171             # but will overcome mg's problems with building
    172             # empty indexes
    173             $new_text = "this is dummy text to stop mg barfing";
    174             $self->{'num_processed_bytes'} += length ($new_text);
    175 
    176             } elsif ($real_field eq "text") {
                    # section text is only fetched when store_text is set; otherwise
                    # $new_text stays empty and contributes 0 processed bytes.
                    # \cB and \cC are stripped because they are used as separators
                    # in the emitted output.
                    # NOTE(review): find_paragraphs' return value is discarded -
                    # presumably it edits $new_text in place; confirm against its definition
    177             $new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
    178             $self->{'num_processed_bytes'} += length ($new_text);
    179             $new_text =~ s/[\cB\cC]//g;
    180             $self->find_paragraphs($new_text);
    181            
    182             } else {
                    # any other field is looked up as metadata on the section
    183             my $first = 1;
    184             $real_field =~ s/^ex\.([^.]+)$/$1/; # remove ex. namespace iff it's the only namespace prefix (will leave ex.dc.* intact)
    185             my @section_metadata = @{$doc_obj->get_metadata ($section, $real_field)};
                    # for a section-level index, a non-top section can also pick up the
                    # top section's metadata: always, or only when it has none of its
                    # own, depending on sections_index_document_metadata
    186             if ($level eq "section" && $section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
    187                 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
    188                 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $real_field)});
    189                 }
    190             }
                    # values are joined with \cC; the byte counter and the separator are
                    # applied for every value, but the value itself is only appended
                    # when store_text is set
    191             foreach my $meta (@section_metadata) {
    192                 $meta =~ s/[\cB\cC]//g;
    193                 $self->{'num_processed_bytes'} += length ($meta);
    194                 $new_text .= "\cC" unless $first;
    195                 $new_text .= $meta if $self->{'store_text'};
    196                 $first = 0;
    197             }
    198             }
    199            
    200             # filter the text
                    # (filter_text is given the original field name, including any "top" prefix)
    201             $new_text = $self->filter_text ($field, $new_text);
    202 
    203             $text .= "$new_text\cC";
    204         }
    205         }
    206     }
    207    
            # one \cB per section: for a "document" level index they are held back in
            # $text_extra and emitted after all the text, otherwise they are
            # appended to the text right after the section's content
    208     if ($level eq "document") { $text_extra .= "\cB"; }
    209     else { $text .= "\cB"; }
    210    
    211     $section = $doc_obj->get_next_section($section);
    212     }
    213 
    214     print $handle "$text$text_extra";
    215 }
    216 
    217 
    2181721;
Note: See TracChangeset for help on using the changeset viewer.