Changeset 8795


Ignore:
Timestamp:
2004-12-14T12:31:58+13:00 (19 years ago)
Author:
kjdon
Message:

If use_sections is on, we are now a bit more relaxed about what the sections are split on. It used to be <a name=, now it's <[Aa] name=, and instead of matching \d+ for the title, we use \"?\w+\"?, because sometimes you get <a name="outline">.
Since we can now have non-digit titles in the document, we explicitly set gsdlthistype to "Paged"; otherwise Greenstone will treat the document as hierarchical if there are non-digit titles.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/PDFPlug.pm

    r8716 r8795  
    150150
    151151    # we have "<a name=1></a>" etc for each page
    152     my @sections = split('<a name=', $$textref);
     152    # it may be <A name=
     153    my @sections = split('<[Aa] name=', $$textref);
    153154
    154155    if (scalar (@sections) == 1) { #only one section - no split!
     
    161162   
    162163    my $title = $sections[0];
    163     $title =~ s/^\d+>//; # specific for pdftohtml...
     164    $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
    164165    $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
    165166    $title =~ s/<[^>]*>/ /g;
     
    183184    # add metadata per section...
    184185    foreach my $section (@sections) {
    185         $section =~ s@^(\d+)></a>@@; # leftover from split expression...
     186        # section names are not always just digits, may be like "outline"
     187        $section =~ s@^\"?(\w+)\"?></a>@@; # leftover from split expression...
    186188
    187189        $title = $1; # Greenstone does magic if sections are titled digits
    188190        if (! defined($title) ) {
    189191        print STDERR "no title: $section\n";
     192        $title = " "; # get rid of the undefined warning in next line
    190193        }
    191194        my $newsection = "<!-- from PDFPlug -->\n<!-- <Section>\n";
     
    237240
    238241    # Add NumPages metadata (we have "<a name=1>" etc for each page)
    239     my @pages = ($$textref =~ /\<a name=\d+\>/ig);
     242    my @pages = ($$textref =~ /\<[Aa] name=\"?\w+\"?>/ig);
    240243    $doc_obj->add_utf8_metadata($cursection, "NumPages", scalar(@pages));
    241 
     244   
     245    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
     246    # we explicitly make it a paged document, cos greenstone won't get it
     247    # right if any section has an empty title, or one with letters in it
     248    $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
     249    }
    242250    return $result;
    243251}
Note: See TracChangeset for help on using the changeset viewer.