Context Navigation

← Previous Changeset
Next Changeset →

Changeset 2901

Timestamp:

2002-01-14T17:38:47+13:00 (22 years ago)

Author:

jrm21

Message:

We now interpret some LaTeX commands in the input, mostly to do with accents.

File:

: 1 edited

trunk/gsdl/perllib/plugins/BibTexPlug.pm (modified) (15 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/BibTexPlug.pm

-              r2484
+              r2901
+#
 # Copyright 2000 Gordon W. Paynter
 # Copyright 1999-2000 New Zealand Digital Library Project
+# Copyright 1999-2001 New Zealand Digital Library Project
+#
 # This program is free software; you can redistribute it and/or modify
 …
 # It is a subclass of SplitPlug, so if there are multiple records, all
 # are read.
+#
+# Modified Dec 2001 by John McPherson:
+#  *  some modifications submitted by Sergey Yevtushenko
+#                      <[email protected]>
+#  *  some non-ascii char support (ie mostly Latin)
+#  *  The raw ascii bibtex entry is stored as "BibTex" metadata.
 package BibTexPlug;
 use SplitPlug;
 # BibTexPlug is a sub-class of BasPlug.
 …
     return q^\n+(?=@)^;
+}
 # The process function reads a single bibliographic record and stores
 …
     # This hash translates BibTex field names into metadata names.  The
     # BibTex names are taken from the "Local Guide to Latex" Graeme
     # McKinstry.  Metadata names are consistabnt with ReferPlug.
+    # McKinstry.  Metadata names are consistent with ReferPlug.
     my %field = (
          'address', 'PublisherAddress',
          'author', 'Creator',
          'booktitle', 'Booktitle',
          'chapter', 'Chapter',
          'edition', 'Edition',
          'editor', 'Editor',
+         'editor', 'Editor',
          'institution', 'Publisher',
          'journal', 'Journal',
 …
          'keywords', 'Keywords',
          'abstract', 'Abstract',
+         'copyright', 'Copyright');
+         'copyright', 'Copyright'
+);
     # Metadata fields
 …
     my ($EntryType, $EntryID, $Creator, $Keywords, $text);
+    my $verbosity = $self->{'verbosity'};
+    $verbosity = 0 unless $verbosity;
+    my $lines=$$textref;
     # Make sure the text has exactly one entry per line
+    my $lines = $$textref;
+    $lines =~ s/,\s*\n/=====/g;
+    $lines  =~ s/^\s*(\@[^,]+,)\s*\n/$1=====/; #splitting key in entry
+    $lines =~ s/([\"\}]\s*,)\s*\n/$1=====/g; #splitting by comma, followed by \n (assuming end of lines are " or })
+    $lines =~ s/(\d+\s*\,)\s*\n/$1=====/g; #for the case, when we have number entry without closing "
+    $lines =~ s/\n\s*\n/%%%%%/g; #this was simply added in order to allow to process newline inside  quoted strings,
+                                 #that continues for several lines
     $lines =~ s/\s+/ /g;
     $lines =~ s/\s*=====\s*/\n/g;
+    my @lines = split(/\n+/, $lines);
+    my @all_lines = split(/\n+/, $lines);
     # Read and process each line in the bib file.
     my ($id, $name, $value, $line);
     foreach $line (@lines) {
+    my ($entryname, $name, $value, $line);
+    foreach $line (@all_lines) {
     # Add each line.  Most lines consist of a field identifer and
     # then data, and we simply store them, though we treat some
 …
     $text .= "$line\n";
+    print "Processing line = $line \n" if $verbosity>=4;
     # The first line is special, it contains the reference type and OID
     if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]+)\W*$/) {
 …
+    }
     if ($line =~ /\@/) {
         print "bibtexplug: suspect line in bibtex file: $line\n"
+        print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
         if ($verbosity >= 2);
         print "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
+        print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
         if ($verbosity >= 2);
+    }
 …
     # otherwise, parse the metadata out of this line
     next unless ($line =~ /^\s*(\w+)\s+=\s+(.*)/);
     $id = lc($1);
+    $entryname = lc($1);
     $value = $2;
+    # tidy up, removing " at start and end
+    $value =~ s/^"//;
+    $value =~ s/(",)\s*$//;
+    $value = &process_latex($value);
     # Add this line of metadata
+    $metadata{$id} .= "$value\n";
+    $metadata{$entryname} .= "$value\n";
+    }
     # Add the Entry type as metadata
     $doc_obj->add_metadata ($cursection, "EntryType", $EntryType);
+    $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
     # Add the various field as metadata
+    foreach my $id (keys %metadata) {
+    foreach my $entryname (keys %metadata) {
+    next unless (defined $field{$entryname});
+    next unless (defined $metadata{$entryname});
+    next unless (defined $field{$id});
+    next unless (defined $metadata{$id});
+    $name = $field{$id};
+    $value = $metadata{$id};
+    # Get rid of silly Latex stuff
+    if ($value =~ /\"(.*)\"/) {
+        $value = $1;
+    }
+    if ($value =~ /\{(.*)\}/) {
+        $value = $1;
+    }
+    # Add the various field as metadata
+    $value = &text_into_html($value);
+    $doc_obj->add_metadata ($cursection, $name, $value);
+    $name = $field{$entryname};
+    $value = $metadata{$entryname};
+    # Add the various fields as metadata
+    my $html_value = &text_into_html($value);
+    $doc_obj->add_utf8_metadata ($cursection, $name, $html_value);
     # Several special operatons on metadata follow
 …
     # The full set of keywords will be added, in due course, as "Keywords".
     # However, we also want to add them as individual "Keyword" metadata elements.
     if ($id eq "keywords") {
+    if ($entryname eq "keywords") {
         my @keywordlist = split(/,/, $value);
         foreach my $k (@keywordlist) {
         $k = lc($k);
+        $k = lc($k);
         $k =~ s/\s*$//;
         $k =~ s/^\s*//;
         if ($k =~ /\w/) {
             $k = &text_into_html($k);
             $doc_obj->add_metadata ($cursection, "Keyword", $k);
+            $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
+        }
+        }
 …
     # also want to split it into several individual "Author" fields in
     # "Lastename, Firstnames" format so we can browse it.
+    if ($id eq "author") {
+        my @authorlist = split(/(,|and)/, $value);
+    if ($entryname eq "author") { #added also comparison with editor
+        # und here for german language...
+        # don't use brackets in pattern, else the matched bit becomes
+        # an element in the list!
+        my @authorlist = split(/,|\s+and\s+|\s+und\s+/, $value);
         foreach $a (@authorlist) {
         $a =~ s/\s*$//;
         $a =~ s/^\s*//;
         # Reformat and add author name
+        next if $a=~ /^\s*$/;
         my @words = split(/ /, $a);
         my $lastname = pop @words;
         my $firstname = join(" ",  @words);
         my $fullname = $lastname . ", " . $firstname;
         # Add each name to set of Authors
+        # force utf8 pragma so that \w matches in this scope
+        use utf8;
         if ($fullname =~ /\w+, \w+/) {
+            $fullname = &text_into_html($fullname);
+            $doc_obj->add_metadata ($cursection, "Author", $fullname);
+           $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
+        }
+        }
 …
     # Books and Journals are additionally marked for display purposes
     if ($id eq "booktitle") {
         $doc_obj->add_metadata($cursection, "BookConfOnly", 1);
     } elsif ($id eq "journal") {
         $doc_obj->add_metadata($cursection, "JournalsOnly", 1);
+    if ($entryname eq "booktitle") {
+        $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
+    } elsif ($entryname eq "journal") {
+        $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
+    }
 …
     if ($text =~ /\w/) {
     $text = &text_into_html($text);
+    $doc_obj->add_text ($cursection, $text);
+    $doc_obj->add_utf8_text ($cursection, $text);
+    $doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
+    }
 …
+# Turn e-mail addresses and http:// URLs in a piece of text into HTML links.
+# Takes the text as its only argument and returns the marked-up copy; the
+# caller's string is not modified.
+sub convert_urls_into_links {
+    my $linked = shift;
+    for ($linked) {
+        # anything shaped like name@host becomes a mailto: anchor
+        s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
+        # http URLs become anchors pointing at themselves
+        s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
+    }
+    return $linked;
+}
+# Tidy the whitespace in a piece of text and turn line breaks into HTML:
+# each newline gains a <BR>, and two <BR>s separated only by whitespace
+# collapse into a single <P>.  The "%%%%%" marker is expanded to a <BR> pair
+# first, so it ends up as a <P> too.
+# Takes the text as its only argument and returns the cleaned-up copy.
+sub clean_up_whitespaces {
+    my $tidy = $_[0];
+    for ($tidy) {
+        s/%%%%%/<BR> <BR>/g;       # paragraph marker -> two breaks
+        s/ +/ /g;                  # squeeze runs of spaces
+        s/\s*$//;                  # trim the end ...
+        s/^\s*//;                  # ... and the start
+        s/\n/\n<BR>/g;             # remaining newlines become breaks
+        s/<BR>\s*<BR>/<P>/g;       # a double break is a paragraph
+    }
+    return $tidy;
+}
+# Escape or blank out characters that cause trouble in the generated HTML.
+# The ampersand is deliberately left alone here.  The order of the steps
+# matters: LaTeX-style `` and '' quotes are folded into " before " itself is
+# turned into an entity, and only then are the remaining single ' and `
+# converted.
+# Takes the text as its only argument and returns the converted copy.
+sub convert_problem_characters_without_ampersand {
+    my $escaped = shift;
+    for ($escaped) {
+        # angle brackets
+        s/</&lt;/g;
+        s/>/&gt;/g;
+        # quotes (the first two are Latex -specific conversions)
+        s/\'\'/\"/g;
+        s/\`\`/\"/g;
+        s/\"/&quot;/g;
+        s/\'/&#8217;/g;
+        s/\`/&#8216;/g;
+        # plus signs and parentheses are replaced by a space
+        s/\+/ /g;
+        s/\(/ /g;
+        s/\)/ /g;
+        # backslashes are doubled, then full stops get a backslash prefix
+        s/\\/\\\\/g;
+        s/\./\\\./g;
+    }
+    return $escaped;
+}
 # Convert a text string into HTML.
 # The HTML is going to be inserted into a GML file, so we have to be
 # careful not to use symbols like ">", which ocurs frequently in email
+# careful not to use symbols like ">", which occurs frequently in email
 # messages (and use &gt instead.
 …
 # with <P> tags).
 sub text_into_html {
     my ($text) = @_;
+    # Convert problem charaters into HTML symbols
+    # Convert problem characters into HTML symbols
     $text =~ s/&/&amp;/g;
+    $text =~ s/</&lt;/g;
+    $text =~ s/>/&gt;/g;
+    $text =~ s/\"/&quot;/g;
+    $text =~ s/\'/ /g;
+    $text =~ s/\+/ /g;
+    $text =~ s/\(/ /g;
+    $text =~ s/\)/ /g;
+    $text = &convert_problem_characters_without_ampersand( $text );
     # convert email addresses and URLs into links
+    $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
+    $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
+    # Clean up whitespace and convert \n charaters to <BR> or <P>
+    $text =~ s/ +/ /g;
+    $text =~ s/\s*$//;
+    $text =~ s/^\s*//;
+    $text =~ s/\n/\n<BR>/g;
+    $text =~ s/<BR>\s*<BR>/<P>/g;
+    $text = &convert_urls_into_links( $text );
+    $text = &clean_up_whitespaces( $text );
     return $text;
+}
+# Convert accented characters, remove { }, interpret some commands....
+#
+# Argument: $text - a BibTeX field value that may contain LaTeX accent
+#                   commands (\'e, \"{o}, \v{s}, {\ss}, ...), { } groupings,
+#                   \command{...} wrappers and $...$ maths.
+# Returns:  the converted copy of the string.  Each recognised accented
+#           letter is replaced by the bytes of its UTF-8 encoding (for
+#           example \'e becomes the two bytes 0xC3 0xA9).
+#
+# Note!! This is not comprehensive! Only the accent/letter pairs listed in
+# the two tables below are converted; anything else is left as written.
+# Also assumes Latin -> Unicode!
+sub process_latex {
+    my ($text) = @_;
+
+    # note - this is really ugly, but it works. There may be a prettier way
+    # of mapping latex accented chars to utf8, but we just brute force it here.
+    # Keys are the literal text that follows the backslash; values are the
+    # UTF-8 byte sequence of the resulting character.
+    my %utf8_chars =
+    (
+     # acutes
+     '\'a' => "\xc3\xa1",
+     '\'c' => "\xc4\x87",
+     '\'e' => "\xc3\xa9",
+     '\'i' => "\xc3\xad",
+     '\'l' => "\xc4\xba",   # U+013A (was c3 ba, which is u-acute)
+     '\'n' => "\xc5\x84",   # U+0144 (was c3 84, which is A-diaeresis)
+     '\'o' => "\xc3\xb3",
+     '\'r' => "\xc5\x95",
+     '\'s' => "\xc5\x9b",
+     '\'u' => "\xc3\xba",
+     '\'y' => "\xc3\xbd",
+     '\'z' => "\xc5\xba",
+     # graves
+     '`a' => "\xc3\xa0",
+     '`A' => "\xc3\x80",
+     '`e' => "\xc3\xa8",
+     '`E' => "\xc3\x88",
+     '`i' => "\xc3\xac",
+     '`I' => "\xc3\x8c",
+     '`o' => "\xc3\xb2",
+     '`O' => "\xc3\x92",
+     '`u' => "\xc3\xb9",
+     '`U' => "\xc3\x99",
+     # circumflex
+     '^a' => "\xc3\xa2",
+     '^A' => "\xc3\x82",
+     '^c' => "\xc4\x89",
+     '^C' => "\xc4\x88",
+     '^e' => "\xc3\xaa",
+     '^E' => "\xc3\x8a",
+     '^g' => "\xc4\x9d",
+     '^G' => "\xc4\x9c",
+     '^h' => "\xc4\xa5",
+     '^H' => "\xc4\xa4",
+     '^i' => "\xc3\xae",
+     '^I' => "\xc3\x8e",
+     '^j' => "\xc4\xb5",
+     '^J' => "\xc4\xb4",
+     '^o' => "\xc3\xb4",
+     '^O' => "\xc3\x94",
+     '^s' => "\xc5\x9d",
+     '^S' => "\xc5\x9c",
+     '^u' => "\xc3\xbb",   # U+00FB (was c3 a2, which is a-circumflex)
+     '^U' => "\xc3\x9b",   # U+00DB (was c3 bb, the lower-case letter)
+     '^w' => "\xc5\xb5",
+     '^W' => "\xc5\xb4",
+     '^y' => "\xc5\xb7",
+     '^Y' => "\xc5\xb6",
+     # diaeresis
+     '"a' => "\xc3\xa4",
+     '"A' => "\xc3\x84",
+     '"e' => "\xc3\xab",
+     '"E' => "\xc3\x8b",
+     # both the dotless form (\"\i) and the plain form (\"i) are accepted;
+     # '"\\i' is the three characters  " \ i
+     '"\\i' => "\xc3\xaf",
+     '"\\I' => "\xc3\x8f",
+     '"i' => "\xc3\xaf",
+     '"I' => "\xc3\x8f",
+     '"o' => "\xc3\xb6",
+     '"O' => "\xc3\x96",
+     '"u' => "\xc3\xbc",
+     '"U' => "\xc3\x9c",
+     '"y' => "\xc3\xbf",
+     '"Y' => "\xc5\xb8",   # U+0178 (was c3 b8, which is o-stroke)
+     # tilde, breve, ring, dot - not handled yet
+     # caron, double acute, stroke, cedilla - handled specially, see below
+     # macron
+     '=a' => "\xc4\x81",
+     '=A' => "\xc4\x80",
+     '=e' => "\xc4\x93",
+     '=E' => "\xc4\x92",
+     '=i' => "\xc4\xab",
+     '=I' => "\xc4\xaa",
+     '=o' => "\xc5\x8d",   # U+014D (was c4 8d, which is c-caron)
+     '=O' => "\xc5\x8c",   # U+014C (was c4 8c, which is C-caron)
+     '=u' => "\xc5\xab",   # U+016B (was c4 ab, which is i-macron)
+     '=U' => "\xc5\xaa",   # U+016A (was c4 aa, which is I-macron)
+     );
+
+    # these are one letter latex commands - we make sure they're not a longer
+    # command name. eg {\d} is d+stroke, so careful of \d
+    my %special_utf8_chars =
+    (
+     # caron
+     'v n' => "\xc5\x88",
+     'v N' => "\xc5\x87",
+     'v s' => "\xc5\xa1",
+     'v S' => "\xc5\xa0",   # U+0160 (was c5 a5, which is t-caron)
+     # cedilla
+     'c c' => "\xc3\xa7",
+     'c C' => "\xc3\x87",
+     'c g' => "\xc4\xa3",
+     'c G' => "\xc4\xa2",
+     'c k' => "\xc4\xb7",
+     'c K' => "\xc4\xb6",
+     'c l' => "\xc4\xbc",
+     'c L' => "\xc4\xbb",
+     'c n' => "\xc5\x86",
+     'c N' => "\xc5\x85",
+     'c r' => "\xc5\x97",
+     'c R' => "\xc5\x96",
+     'c s' => "\xc5\x9f",
+     'c S' => "\xc5\x9e",
+     'c t' => "\xc5\xa3",
+     'c T' => "\xc5\xa2",
+     # double acute / Hungarian accent
+     'H O' => "\xc5\x90",
+     'H o' => "\xc5\x91",
+     'H U' => "\xc5\xb0",
+     'H u' => "\xc5\xb1",
+     # stroke
+     'd' => "\xc4\x91",
+     'D' => "\xc4\x90",
+     'h' => "\xc4\xa7",
+     # 'H' (H-stroke) is left out on purpose: \H normally means the
+     # Hungarian umlaut handled above.
+     'l' => "\xc5\x82",
+     'L' => "\xc5\x81",
+     'o' => "\xc3\xb8",
+     'O' => "\xc3\x98",
+     't' => "\xc5\xa7",
+     'T' => "\xc5\xa6",
+     # german ss/szlig/sharp s
+     'ss' => "\xc3\x9f",
+     );
+
+    # convert latex-style accented characters.
+
+    # remove space (if any) between \ and letter to accent (eg {\' a})
+    $text =~ s@(\\[`'="^])\s(\w)@$1$2@g;
+
+    # remove {} around a single character (eg \'{e}) - every occurrence
+    $text =~ s@(\\[`'="^])\{(\w)\}@$1$2@g;
+
+    # remove {} around a single character for special 1 letter commands -
+    # need to insert a space. Eg \v{s}  ->  {\v s}
+    $text =~ s@(\\[vcH])\{(\w)\}@{$1 $2}@g;
+
+    # this is slow (go through whole hash for each substitution!) so
+    # only do if the text contains a '\' character.
+    if ($text =~ m|\\|) {
+        for my $latex_code (keys %utf8_chars) {
+            # quotemeta: keys such as ^a contain regex metacharacters and
+            # must be matched literally
+            my $pattern = quotemeta($latex_code);
+            $text =~ s/\\${pattern}/$utf8_chars{$latex_code}/g;
+        }
+
+        # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
+        # only do the change if immediately followed by a space, }, {, or \
+        # (or the end of the string).  A lookahead is used so the following
+        # character is not consumed and back-to-back commands both convert.
+        for my $latex_code (keys %special_utf8_chars) {
+            my $pattern = quotemeta($latex_code);
+            $text =~ s/\\${pattern}(?=[\\\s\{\}]|\z)/$special_utf8_chars{$latex_code}/g;
+        }
+    }
+
+    # remove latex commands, keeping their argument: \emph{x} -> x.
+    # Innermost first, repeated until nothing is left to unwrap.  This has
+    # to happen before the bare groupings are stripped, otherwise the braces
+    # delimiting the argument are already gone and "\emphx" is left behind.
+    while ($text =~ s@\\\w+\{([^{}]*)\}@$1@g) {}
+
+    # remove latex groupings { } (but not \{ or \} ), including empty ones
+    # and one at the very start of the string.  The loop handles nesting.
+    while ($text =~ s@(?<!\\)\{((?:[^}]*?[^\\])?)\}@$1@g) {}
+
+    # maths mode $...$ - this is not interpreted in any way at the moment,
+    # the delimiters are just dropped (non-greedy, so "$a$ and $b$" works)
+    $text =~ s@\$(.*?)\$@$1@g;
+
+    # quoted { } chars
+    $text =~ s@\\\{@{@g;
+    $text =~ s@\\\}@}@g;
+
+    return $text;
+}
 sub set_OID {

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 2901

Legend:

trunk/gsdl/perllib/plugins/BibTexPlug.pm

Download in other formats: