Changeset 2901


Ignore:
Timestamp:
2002-01-14T17:38:47+13:00 (20 years ago)
Author:
jrm21
Message:

We now interprete some latex commands in the input, mostly to do with accents.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/BibTexPlug.pm

    r2484 r2901  
    88#
    99# Copyright 2000 Gordon W. Paynter
    10 # Copyright 1999-2000 New Zealand Digital Library Project
     10# Copyright 1999-2001 New Zealand Digital Library Project
    1111#
    1212# This program is free software; you can redistribute it and/or modify
     
    3535# It is a subclass of SplitPlug, so if there are multiple records, all
    3636# are read.
    37 
     37#
     38# Modified Dec 2001 by John McPherson:
     39#  *  some modifications submitted by Sergey Yevtushenko
     40#                      <sergey@intellektik.informatik.tu-darmstadt.de>
     41#  *  some non-ascii char support (ie mostly Latin)
     42#  *  The raw ascii bibtex entry is stored as "BibTex" metadata.
    3843
    3944package BibTexPlug;
    4045
    4146use SplitPlug;
    42 
    4347
    4448# BibTexPlug is a sub-class of BasPlug.
     
    5660    return q^\n+(?=@)^;
    5761}
     62
     63
    5864
    5965# The process function reads a single bibliographic record and stores
     
    8086    # This hash translates BibTex field names into metadata names.  The
    8187    # BibTex names are taken from the "Local Guide to Latex" Graeme
    82     # McKinstry.  Metadata names are consistabnt with ReferPlug.
     88    # McKinstry.  Metadata names are consistent with ReferPlug.
    8389
    8490    my %field = (
    8591         'address', 'PublisherAddress',
    8692         'author', 'Creator',
     93         
    8794         'booktitle', 'Booktitle',
    8895         'chapter', 'Chapter',
    8996         'edition', 'Edition',
    90          'editor', 'Editor',
     97         'editor', 'Editor', 
    9198         'institution', 'Publisher',
    9299         'journal', 'Journal',
     
    102109         'keywords', 'Keywords',
    103110         'abstract', 'Abstract',
    104          'copyright', 'Copyright');
     111         'copyright', 'Copyright'
     112);
    105113
    106114    # Metadata fields
     
    108116    my ($EntryType, $EntryID, $Creator, $Keywords, $text);
    109117
     118    my $verbosity = $self->{'verbosity'};
     119    $verbosity = 0 unless $verbosity;
     120
     121    my $lines=$$textref;
     122
    110123    # Make sure the text has exactly one entry per line
    111     my $lines = $$textref;
    112     $lines =~ s/,\s*\n/=====/g;
     124
     125    $lines  =~ s/^\s*(\@[^,]+,)\s*\n/$1=====/; #splitting key in entry
     126    $lines =~ s/([\"\}]\s*,)\s*\n/$1=====/g; #splitting by comma, followed by \n (assuming end of lines are " or })
     127    $lines =~ s/(\d+\s*\,)\s*\n/$1=====/g; #for the case, when we have number entry without closing "
     128    $lines =~ s/\n\s*\n/%%%%%/g; #this was simply added in order to allow to process newline inside  quoted strings,
     129                                 #that continues for several lines
    113130    $lines =~ s/\s+/ /g;
    114131    $lines =~ s/\s*=====\s*/\n/g;
    115     my @lines = split(/\n+/, $lines);
    116    
     132
     133    my @all_lines = split(/\n+/, $lines);
     134
    117135    # Read and process each line in the bib file.
    118     my ($id, $name, $value, $line);
    119     foreach $line (@lines) {
    120    
     136    my ($entryname, $name, $value, $line);
     137    foreach $line (@all_lines) {
     138
    121139    # Add each line.  Most lines consist of a field identifer and
    122140    # then data, and we simply store them, though we treat some
     
    126144    $text .= "$line\n";
    127145
    128    
     146    print "Processing line = $line \n" if $verbosity>=4;
     147
    129148    # The first line is special, it contains the reference type and OID
    130149    if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]+)\W*$/) {
     
    137156    }
    138157    if ($line =~ /\@/) {
    139         print "bibtexplug: suspect line in bibtex file: $line\n"
     158        print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
    140159        if ($verbosity >= 2);
    141         print "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
     160        print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
    142161        if ($verbosity >= 2);
    143162    }
     
    145164    # otherwise, parse the metadata out of this line
    146165    next unless ($line =~ /^\s*(\w+)\s+=\s+(.*)/);
    147     $id = lc($1);
     166    $entryname = lc($1);
    148167    $value = $2;
    149    
     168    # tidy up, removing " at start and end 
     169    $value =~ s/^"//;
     170    $value =~ s/(",)\s*$//;
     171    $value = &process_latex($value);
     172
    150173    # Add this line of metadata
    151     $metadata{$id} .= "$value\n";
     174    $metadata{$entryname} .= "$value\n";
     175       
    152176    }
    153177
    154178    # Add the Entry type as metadata
    155     $doc_obj->add_metadata ($cursection, "EntryType", $EntryType);
     179    $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
    156180
    157181    # Add the various field as metadata
    158     foreach my $id (keys %metadata) {
     182    foreach my $entryname (keys %metadata) {
     183    next unless (defined $field{$entryname});
     184    next unless (defined $metadata{$entryname});   
    159185   
    160     next unless (defined $field{$id});
    161     next unless (defined $metadata{$id});   
    162    
    163     $name = $field{$id};
    164     $value = $metadata{$id};
    165 
    166     # Get rid of silly Latex stuff
    167     if ($value =~ /\"(.*)\"/) {
    168         $value = $1;
    169     }
    170     if ($value =~ /\{(.*)\}/) {
    171         $value = $1;
    172     }
    173        
    174     # Add the various field as metadata
    175     $value = &text_into_html($value);
    176     $doc_obj->add_metadata ($cursection, $name, $value);
     186    $name = $field{$entryname};
     187    $value = $metadata{$entryname};
     188
     189    # Add the various fields as metadata   
     190    my $html_value = &text_into_html($value);
     191    $doc_obj->add_utf8_metadata ($cursection, $name, $html_value);
    177192
    178193    # Several special operatons on metadata follow
     
    181196    # The full set of keywords will be added, in due course, as "Keywords".
    182197    # However, we also want to add them as individual "Keyword" metadata elements.
    183     if ($id eq "keywords") {
     198    if ($entryname eq "keywords") {
    184199        my @keywordlist = split(/,/, $value);
    185200        foreach my $k (@keywordlist) {
    186         $k = lc($k);
     201        $k = lc($k); 
    187202        $k =~ s/\s*$//;
    188203        $k =~ s/^\s*//;
    189204        if ($k =~ /\w/) {
    190205            $k = &text_into_html($k);
    191             $doc_obj->add_metadata ($cursection, "Keyword", $k);
     206            $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
    192207        }
    193208        }
     
    198213    # also want to split it into several individual "Author" fields in
    199214    # "Lastename, Firstnames" format so we can browse it.
    200     if ($id eq "author") {
    201        
    202         my @authorlist = split(/(,|and)/, $value);
     215    if ($entryname eq "author") { #added also comparison with editor
     216       
     217        # und here for german language...
     218        # don't use brackets in pattern, else the matched bit becomes
     219        # an element in the list!
     220        my @authorlist = split(/,|\s+and\s+|\s+und\s+/, $value);
    203221        foreach $a (@authorlist) {
    204222        $a =~ s/\s*$//;
    205223        $a =~ s/^\s*//;
    206        
    207224        # Reformat and add author name
     225        next if $a=~ /^\s*$/;
    208226        my @words = split(/ /, $a);
    209227        my $lastname = pop @words;
    210228        my $firstname = join(" ",  @words);
     229
    211230        my $fullname = $lastname . ", " . $firstname;
    212        
     231
    213232        # Add each name to set of Authors
     233        # force utf8 pragma so that \w matches in this scope
     234        use utf8;
    214235        if ($fullname =~ /\w+, \w+/) {
    215             $fullname = &text_into_html($fullname);
    216             $doc_obj->add_metadata ($cursection, "Author", $fullname);
     236           $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
    217237        }
    218238        }
     
    220240
    221241    # Books and Journals are additionally marked for display purposes
    222     if ($id eq "booktitle") {
    223         $doc_obj->add_metadata($cursection, "BookConfOnly", 1);
    224     } elsif ($id eq "journal") {
    225         $doc_obj->add_metadata($cursection, "JournalsOnly", 1);
     242    if ($entryname eq "booktitle") {
     243        $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
     244    } elsif ($entryname eq "journal") {
     245        $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
    226246    }
    227247
     
    231251    if ($text =~ /\w/) {
    232252    $text = &text_into_html($text);
    233     $doc_obj->add_text ($cursection, $text);
     253    $doc_obj->add_utf8_text ($cursection, $text);
     254    $doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
    234255    }
    235256
     
    238259
    239260
     261
     262
     263# convert email addresses and URLs into links
     264sub convert_urls_into_links{
     265   my ($text) = @_;
     266 
     267   $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
     268   $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
     269
     270   return $text;
     271}
     272
     273# Clean up whitespace and convert \n charaters to <BR> or <P>
     274sub clean_up_whitespaces{
     275   my ($text) = @_;
     276
     277   $text =~ s/%%%%%/<BR> <BR>/g;
     278   $text =~ s/ +/ /g;
     279   $text =~ s/\s*$//;
     280   $text =~ s/^\s*//;
     281   $text =~ s/\n/\n<BR>/g;
     282   $text =~ s/<BR>\s*<BR>/<P>/g;
     283
     284   return $text;
     285}
     286
     287
     288sub convert_problem_characters_without_ampersand{
     289    my ($text) = @_;
     290    $text =~ s/</&lt;/g;
     291    $text =~ s/>/&gt;/g;
     292   
     293    $text =~ s/\'\'/\"/g; #Latex -specific conversion
     294    $text =~ s/\`\`/\"/g; #Latex -specific conversion
     295
     296
     297    $text =~ s/\"/&quot;/g;
     298    $text =~ s/\'/&#8217;/g;
     299    $text =~ s/\`/&#8216;/g;
     300    $text =~ s/\+/ /g;
     301    $text =~ s/\(/ /g;
     302    $text =~ s/\)/ /g;
     303
     304    $text =~ s/\\/\\\\/g;
     305
     306    $text =~ s/\./\\\./g;
     307
     308    return $text;
     309}
     310
    240311# Convert a text string into HTML.
    241312
    242313# The HTML is going to be inserted into a GML file, so we have to be
    243 # careful not to use symbols like ">", which ocurs frequently in email
     314# careful not to use symbols like ">", which occurs frequently in email
    244315# messages (and use &gt instead.
    245316
     
    248319# with <P> tags).
    249320
    250 
    251321sub text_into_html {
    252322    my ($text) = @_;
    253323
    254 
    255     # Convert problem charaters into HTML symbols
     324    # Convert problem characters into HTML symbols
    256325    $text =~ s/&/&amp;/g;
    257     $text =~ s/</&lt;/g;
    258     $text =~ s/>/&gt;/g;
    259     $text =~ s/\"/&quot;/g;
    260     $text =~ s/\'/ /g;
    261     $text =~ s/\+/ /g;
    262     $text =~ s/\(/ /g;
    263     $text =~ s/\)/ /g;
     326
     327    $text = &convert_problem_characters_without_ampersand( $text );
    264328
    265329    # convert email addresses and URLs into links
    266     $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
    267     $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
    268 
    269     # Clean up whitespace and convert \n charaters to <BR> or <P>
    270     $text =~ s/ +/ /g;
    271     $text =~ s/\s*$//;
    272     $text =~ s/^\s*//;
    273     $text =~ s/\n/\n<BR>/g;
    274     $text =~ s/<BR>\s*<BR>/<P>/g;
     330    $text = &convert_urls_into_links( $text );
     331
     332    $text = &clean_up_whitespaces( $text );
    275333
    276334    return $text;
    277335}
     336
     337
     338
     339
     340# Convert accented characters, remove { }, interprete some commands....
     341# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
     342sub process_latex {
     343    my ($text) = @_;
     344     
     345    # note - this is really ugly, but it works. There may be a prettier way
     346    # of mapping latex accented chars to utf8, but we just brute force it here.
     347    # Also, this isn't complete - not every single possible accented letter
     348    # is in here yet, but most of the common ones are.
     349
     350    my %utf8_chars =
     351    (
     352     # acutes
     353     '\'a' => chr(0xc3).chr(0xa1),
     354     '\'c' => chr(0xc4).chr(0x87),
     355     '\'e' => chr(0xc3).chr(0xa9),
     356     '\'i' => chr(0xc3).chr(0xad),
     357     '\'l' => chr(0xc3).chr(0xba),
     358     '\'n' => chr(0xc3).chr(0x84),
     359     '\'o' => chr(0xc3).chr(0xb3),
     360     '\'r' => chr(0xc5).chr(0x95),
     361     '\'s' => chr(0xc5).chr(0x9b),
     362     '\'u' => chr(0xc3).chr(0xba),
     363     '\'y' => chr(0xc3).chr(0xbd),
     364     '\'z' => chr(0xc5).chr(0xba),
     365     # graves
     366     '`a' => chr(0xc3).chr(0xa0),
     367     '`A' => chr(0xc3).chr(0x80),
     368     '`e' => chr(0xc3).chr(0xa8),
     369     '`E' => chr(0xc3).chr(0x88),
     370     '`i' => chr(0xc3).chr(0xac),
     371     '`I' => chr(0xc3).chr(0x8c),
     372     '`o' => chr(0xc3).chr(0xb2),
     373     '`O' => chr(0xc3).chr(0x92),
     374     '`u' => chr(0xc3).chr(0xb9),
     375     '`U' => chr(0xc3).chr(0x99),
     376     # circumflex
     377     '^a' => chr(0xc3).chr(0xa2),
     378     '^A' => chr(0xc3).chr(0x82),
     379     '^c' => chr(0xc4).chr(0x89),
     380     '^C' => chr(0xc4).chr(0x88),
     381     '^e' => chr(0xc3).chr(0xaa),
     382     '^E' => chr(0xc3).chr(0x8a),
     383     '^g' => chr(0xc4).chr(0x9d),
     384     '^G' => chr(0xc4).chr(0x9c),
     385     '^h' => chr(0xc4).chr(0xa5),
     386     '^H' => chr(0xc4).chr(0xa4),
     387     '^i' => chr(0xc3).chr(0xae),
     388     '^I' => chr(0xc3).chr(0x8e),
     389     '^j' => chr(0xc4).chr(0xb5),
     390     '^J' => chr(0xc4).chr(0xb4),
     391     '^o' => chr(0xc3).chr(0xb4),
     392     '^O' => chr(0xc3).chr(0x94),
     393     '^s' => chr(0xc5).chr(0x9d),
     394     '^S' => chr(0xc5).chr(0x9c),
     395     '^u' => chr(0xc3).chr(0xa2),
     396     '^U' => chr(0xc3).chr(0xbb),
     397     '^w' => chr(0xc5).chr(0xb5),
     398     '^W' => chr(0xc5).chr(0xb4),
     399     '^y' => chr(0xc5).chr(0xb7),
     400     '^Y' => chr(0xc5).chr(0xb6),
     401     
     402     # diaeresis
     403     '"a' => chr(0xc3).chr(0xa4),
     404     '"A' => chr(0xc3).chr(0x84),
     405     '"e' => chr(0xc3).chr(0xab),
     406     '"E' => chr(0xc3).chr(0x8b),
     407     '"\\\\i' => chr(0xc3).chr(0xaf),
     408     '"\\\\I' => chr(0xc3).chr(0x8f),
     409     '"o' => chr(0xc3).chr(0xb6),
     410     '"O' => chr(0xc3).chr(0x96),
     411     '"u' => chr(0xc3).chr(0xbc),
     412     '"U' => chr(0xc3).chr(0x9c),
     413     '"y' => chr(0xc3).chr(0xbf),
     414     '"Y' => chr(0xc3).chr(0xb8),
     415     # tilde
     416     # caron - handled specially
     417#      ',s' => chr(0xc5).chr(0xa1),
     418#      ',S' => chr(0xc5).chr(0xa5),
     419     # breve
     420     # double acute
     421     # ring
     422     # dot
     423     # macron
     424     '=a' => chr(0xc4).chr(0x81),
     425     '=A' => chr(0xc4).chr(0x80),
     426     '=e' => chr(0xc4).chr(0x93),
     427     '=E' => chr(0xc4).chr(0x92),
     428     '=i' => chr(0xc4).chr(0xab),
     429     '=I' => chr(0xc4).chr(0xaa),
     430     '=o' => chr(0xc4).chr(0x8d),
     431     '=O' => chr(0xc4).chr(0x8c),
     432     '=u' => chr(0xc4).chr(0xab),
     433     '=U' => chr(0xc4).chr(0xaa),
     434     
     435     # stroke - handled specially - see below
     436     
     437     # cedilla - handled specially
     438     
     439     );
     440   
     441# these are one letter latex commands - we make sure they're not a longer
     442# command name. eg {\d} is d+stroke, so careful of \d
     443    my %special_utf8_chars =
     444    (
     445     # caron
     446     'v n' => chr(0xc5).chr(0x88),
     447     'v N' => chr(0xc5).chr(0x87),
     448     'v s' => chr(0xc5).chr(0xa1),
     449     'v S' => chr(0xc5).chr(0xa5),
     450     # cedilla
     451     'c c' => chr(0xc3).chr(0xa7),
     452     'c C' => chr(0xc3).chr(0x87),
     453     'c g' => chr(0xc4).chr(0xa3),
     454     'c G' => chr(0xc4).chr(0xa2),
     455     'c k' => chr(0xc4).chr(0xb7),
     456     'c K' => chr(0xc4).chr(0xb6),
     457     'c l' => chr(0xc4).chr(0xbc),
     458     'c L' => chr(0xc4).chr(0xbb),
     459     'c n' => chr(0xc5).chr(0x86),
     460     'c N' => chr(0xc5).chr(0x85),
     461     'c r' => chr(0xc5).chr(0x97),
     462     'c R' => chr(0xc5).chr(0x96),
     463     'c s' => chr(0xc5).chr(0x9f),
     464     'c S' => chr(0xc5).chr(0x9e),
     465     'c t' => chr(0xc5).chr(0xa3),
     466     'c T' => chr(0xc5).chr(0xa2),
     467     # double acute / Hungarian accent
     468     'H O' => chr(0xc5).chr(0x90),
     469     'H o' => chr(0xc5).chr(0x91),
     470     'H U' => chr(0xc5).chr(0xb0),
     471     'H u' => chr(0xc5).chr(0xb1),
     472     
     473     # stroke
     474     'd' => chr(0xc4).chr(0x91),
     475     'D' => chr(0xc4).chr(0x90),
     476     'h' => chr(0xc4).chr(0xa7),
     477#    'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
     478     'l' => chr(0xc5).chr(0x82),
     479     'L' => chr(0xc5).chr(0x81),
     480     'o' => chr(0xc3).chr(0xb8),
     481     'O' => chr(0xc3).chr(0x98),
     482     't' => chr(0xc5).chr(0xa7),
     483     'T' => chr(0xc5).chr(0xa6),
     484     # german ss/szlig/sharp s
     485     'ss' => chr(0xc3).chr(0x9f),
     486     );
     487   
     488    # convert latex-style accented characters.
     489    # remove space (if any) between \ and letter to accent (eg {\' a})
     490
     491    $text =~ s@(\\[`'="])\s(\w)@$1$2@g;
     492
     493    # remove {} around a single character (eg \'{e})
     494    $text =~ s@(\\[`'="]){(\w)}@$1$2@;
     495
     496    # remove {} around a single character for special 1 letter commands -
     497    # need to insert a space. Eg \v{s}  ->  {\v s}
     498    $text =~ s@(\\[vcH]){(\w)}@{$1 $2}@;
     499
     500    # this is slow (go through whole hash for each substitution!) so
     501    # only do if the text contains a '\' character.
     502    if ($text =~ m|\\|) {
     503      for $latex_code (keys %utf8_chars) {
     504      $text =~ s/\\$latex_code/$utf8_chars{$latex_code}/g;
     505      }
     506
     507      # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
     508      # only do the change if immediately followed by a space, }, {, or \
     509      for $latex_code (keys %special_utf8_chars) {
     510      $text =~ s/\\${latex_code}([\\\s\{\}])/$special_utf8_chars{$latex_code}$1/g;
     511      }
     512    }
     513
     514    # remove latex groupings { } (but not \{ or \} )
     515    # note - need it like this for first char match - eg {xx}{yy}
     516    while ($text =~ s@([^\\]){([^}]*?[^\\])}@$1$2@g) {}
     517
     518    # remove latex commands
     519    $text =~ s@\\\w+{(.*)}@$1@g;
     520
     521    # maths mode $...$ - this is not interpreted in any way at the moment...
     522    $text =~ s@\$(.*)\$@$1@g;
     523
     524    # quoted { } chars
     525    $text =~ s@\\{@{@g;
     526    $text =~ s@\\}@}@g;
     527
     528    return $text;
     529}
     530
    278531
    279532sub set_OID {
Note: See TracChangeset for help on using the changeset viewer.