Context Navigation

← Previous Change
Next Change →

BibTexPlug.pm

Timestamp:

2002-07-12T15:19:17+12:00 (22 years ago)

Author:

jrm21

Message:

1) add a space when joining consecutive lines, just in case.

2) Don't use ',' to separate author names.

3) Proper name parsing: first, von, last, jr. And we modify it slightly for
the Creator metadata so it's a nice list with only one "and".

4) Proper Date metadata in the greenstone Date format yyyymmdd so that the
receptionist doesn't print out corrupted strings.

5) Don't create BibTex metadata, as it is exactly the same as the [Text].

6) Modified LaTeX accent parsing so it is faster: only accents actually found
in the text are substituted, instead of the old brute-force pass over the whole hash.

File:

: 1 edited

trunk/gsdl/perllib/plugins/BibTexPlug.pm (modified) (11 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/BibTexPlug.pm

-              r3156
+              r3249
     } else {
         # this is a continuation of previous line
         $entry_line .= $input_line;
+        $entry_line .= " " . $input_line;
+    }
 …
         $value=expand_month($value);
+    }
-    # Add the various fields as metadata
-    my $html_value = &text_into_html($value);
-    $doc_obj->add_utf8_metadata ($cursection, $name, $html_value);
     # Several special operations on metadata follow
 …
         $k =~ s/^\s*//;
         if ($k =~ /\w/) {
-            $k = &text_into_html($k);
             $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
+        }
 …
     if ($entryname eq "author") { #added also comparison with editor
+        # take care of "et al."...
+        $value =~ s/(\s+et\.?\s+al\.?)\s*$//;
+        my $etal=$1;
+        $etal="" if (!defined ($etal));
         # "und" is included here for German-language entries...
         # don't use brackets in pattern, else the matched bit becomes
         # an element in the list!
+        my @authorlist = split(/,|\s+and\s+|\s+und\s+/, $value);
+        my @authorlist = split(/\s+and\s+|\s+und\s+/, $value);
+        my @formattedlist = ();
         foreach $a (@authorlist) {
         $a =~ s/\s*$//;
 …
         # Reformat and add author name
         next if $a=~ /^\s*$/;
+        my @words = split(/ /, $a);
+        my $lastname = pop @words;
+        my $firstname = join(" ",  @words);
+        my $fullname = $lastname . ", " . $firstname;
+        # names are "First von Last", "von Last, First"
+        # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
+        my $first="";
+        my $vonlast="";
+        my $jr="";
+        if ($a =~ /,/) {
+            my @parts=split(/,\s*/, $a);
+            $first = pop @parts;
+            if (scalar(@parts) == 2) {
+            $jr = pop @parts;
+            }
+            $vonlast=shift @parts;
+            if (scalar(@parts) > 0) {
+            print $outhandle "BibTexPlug: couldn't parse name $a\n";
+            # but we continue anyway...
+            }
+        } else { # First von Last
+            my @words = split(/ /, $a);
+            while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
+            $first .= " " . shift (@words);
+            }
+            $first =~ s/^\s//;
+            $vonlast = join (' ', @words); # whatever's left...
+        }
+        my $von="";
+        my $last="";
+        if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
+            $vonlast =~ s/^(([a-z]\w+\s+)+)//;
+            $von = $1;
+            if (!defined ($von)) {
+            # some non-English names do start with lowercase
+            # eg "Marie desJardins". Also we can get typos...
+            print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
+            $von="";
+            if ($vonlast =~ /^[a-z]+$/) {
+                # if it's all lowercase, uppercase 1st.
+                $vonlast =~ s/^(.)/\u$1/;
+            }
+            }
+            $von =~ s/\s*$//;
+            $last=$vonlast;
+        } else {
+            $last=$vonlast;
+        }
+        my $wholename="$first $von $last $jr";
+        $wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
+        push (@formattedlist, $wholename);
+        my $fullname = "$last";
+        $fullname .= " $jr" if ($jr);
+        $fullname .= ", $first";
+        $fullname .= " $von" if ($von);
         # Add each name to set of Authors
         # force utf8 pragma so that \w matches in this scope
         use utf8;
+        if ($fullname =~ /\w+, \w+/) {
+            $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
+        } else {
+        }
+        $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
+        }
+        # Only want at most one "and" in the Creator field
+        if (scalar(@formattedlist) > 2) {
+        my $lastauthor=pop @formattedlist;
+        $value=join(', ', @formattedlist);
+        $value.=" and $lastauthor";
+        } else { # 1 or 2 authors...
+        $value=join(" and ",@formattedlist);
+        }
+        $value.=$etal; # if there was "et al."
+    }
 …
+    }
+    # Add the various fields as metadata
+    $doc_obj->add_utf8_metadata ($cursection, $name, $value);
+    }
+    # Add the text in BibTex format (all fields)
+    # Add Date (yyyymmdd) metadata
+    if (defined ($metadata{'year'}) ) {
+    my $date=$metadata{'year'};
+    chomp $date;
+    my $month=$metadata{'month'};
+    if (defined($month)) {
+        # month is currently 3 letter code or a range...
+        $month = expand_month($month);
+        # take the first month found... might not find one!
+        $month =~ m/_textmonth(\d\d)_/;
+        $month = $1;
+    }
+    if (!defined($month)) {
+        $month="00";
+    }
+    $date .= "${month}00";
+    $doc_obj->add_utf8_metadata($cursection, "Date", $date);
+}
+#    # Add the text in BibTex format (all fields)
     if ($text =~ /\w/) {
+    $text = &text_into_html($text);
+    $text =~ s@&@&amp;@g;
+    $text =~ s@<@&lt;@g;
+    $text =~ s@>@&gt;@g;
+    $text =~ s@\n@<br/>\n@g;
+    $text =~ s@\\@\\\\@g;
+# Not really required...
+#   $doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
     $doc_obj->add_utf8_text ($cursection, $text);
-    $doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
+    }
 …
     $text =~ s/\`\`/\"/g; #Latex -specific conversion
     $text =~ s/\"/&quot;/g;
     $text =~ s/\'/&#8217;/g;
     $text =~ s/\`/&#8216;/g;
+    $text =~ s/\+/ /g;
+    $text =~ s/\(/ /g;
+    $text =~ s/\)/ /g;
+#    $text =~ s/\+/ /g;
+#    $text =~ s/\(/ /g;
+#    $text =~ s/\)/ /g;
     $text =~ s/\\/\\\\/g;
     $text =~ s/\./\\\./g;
+#    $text =~ s/\./\\\./g;
     return $text;
 …
      '~o' => chr(0xc3).chr(0xb5),
      # caron - handled specially
-#      ',s' => chr(0xc5).chr(0xa1),
-#      ',S' => chr(0xc5).chr(0xa5),
      # double acute
      # ring
      # dot
      '\.c' => chr(0xc4).chr(0x8b),
      '\.C' => chr(0xc4).chr(0x8a),
      '\.e' => chr(0xc4).chr(0x97),
      '\.E' => chr(0xc4).chr(0x96),
      '\.g' => chr(0xc4).chr(0xa1),
      '\.G' => chr(0xc4).chr(0xa0),
      '\.I' => chr(0xc4).chr(0xb0),
      '\.z' => chr(0xc5).chr(0xbc),
      '\.Z' => chr(0xc5).chr(0xbb),
+     '.c' => chr(0xc4).chr(0x8b),
+     '.C' => chr(0xc4).chr(0x8a),
+     '.e' => chr(0xc4).chr(0x97),
+     '.E' => chr(0xc4).chr(0x96),
+     '.g' => chr(0xc4).chr(0xa1),
+     '.G' => chr(0xc4).chr(0xa0),
+     '.I' => chr(0xc4).chr(0xb0),
+     '.z' => chr(0xc5).chr(0xbc),
+     '.Z' => chr(0xc5).chr(0xbb),
      # macron
      '=a' => chr(0xc4).chr(0x81),
 …
      # cedilla - handled specially
      );
 …
     # convert latex-style accented characters.
     # remove space (if any) between \ and letter to accent (eg {\' a})
     $text =~ s@(\\[`'="])\s(\w)@$1$2@g;
+    $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
     # remove {} around a single character (eg \'{e})
+    $text =~ s@(\\[`'="\.]){(\w)}@{$1$2}@g;
+    $text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
+    # \, is another way of doing cedilla \c
+    $text =~ s@\\,(.)@\\c $1@g;
     # remove {} around a single character for special 1 letter commands -
     # need to insert a space. Eg \v{s}  ->  {\v s}
     $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
+    # this is slow (go through whole hash for each substitution!) so
     # only do if the text contains a '\' character.
     if ($text =~ m|\\|) {
+      for $latex_code (keys %utf8_chars) {
+      $text =~ s/\\$latex_code/$utf8_chars{$latex_code}/g;
+      }
+      # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
+      # only do the change if immediately followed by a space, }, {, or \
+      for $latex_code (keys %special_utf8_chars) {
+      $text =~ s/\\${latex_code}([\\\s{}])/$special_utf8_chars{$latex_code}$1/g;
+      }
+    # "normal" accents - ie non-alpha latex tag
+    while ($text =~ m@\\([`'="^~\.])([\w])@) {
+        my $tex="$1$2"; my $char="$2";
+        my $replacement=$utf8_chars{$tex};
+        if (!defined($replacement)) {
+        print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
+        $replacement=$char;
+        }
+        $text =~ s/\\$tex/$replacement/g;
+    }
+        # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
+        # only do the change if immediately followed by a space, }, {, or \
+    # one letter accents ( + ss)
+        while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) {
+        my $tex=$1;
+        my $replacement=$special_utf8_chars{$tex};
+        if (!defined($replacement)) {
+        print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
+        $replacement=$tex;
+        }
+        $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
+    }
+    # one letter latex accent commands that affect following letter
+        while ($text =~ m@\\([uvcH]) ([\w])@) {
+          my $tex="$1 $2"; my $char="$2";
+          my $replacement=$special_utf8_chars{$tex};
+          if (!defined($replacement)) {
+          print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
+          $replacement=$char;
+      }
+          $text =~ s/\\$tex/$replacement/g;
+        }
+    }
+    # escape html-sensitive characters
+    $text =~ s@&@&amp;@g;
+    $text =~ s@<@&lt;@g;
+    $text =~ s@>@&gt;@g;
+    $text =~ s/''/"/g; # Latex-specific
+    $text =~ s/``/"/g; # Latex-specific
+    # greenstone-specific
+    $text =~ s@\[@&\#91;@g;
+    $text =~ s@\]@&\#93;@g;
     # remove latex commands
+    # commands with optional arguments...
+    # explicitly recognised commands
+    $text =~ s@\\ldots@&hellip;@g;
+    # maths mode
+    $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
+    # remove all other commands with optional arguments...
     $text =~ s@\\\w+(\[.*?\])?\s*@@g;
     # $text =~ s@\\noopsort{[^}]+\}@@g;
 …
     # remove latex groupings { } (but not \{ or \} )
     while ($text =~ s/([^\\])[\{\}]/$1/g) {;}
+    while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
     $text =~ s/^\{//; # remove { if first char
-    # maths mode $...$ - this is not interpreted in any way at the moment...
-    $text =~ s@\$(.*)\$@$1@g;
     # latex characters
     # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
     $text =~ s/([^\\])~+/$1/g; # non-breaking space  "~"
+    $text =~ s/([^\\])~+/$1 /g; # non-breaking space  "~"
     # optional break "\-"
     if ($text =~ m/\#/) { # concat macros (bibtex)
+    if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
     # the non-macro bits have quotes around them - we just remove quotes
     $text =~ s/[\"\#]//g;
+    $text =~ s/([^&])[\"\#]/$1/g;
+    }
+    # dashes. Convert (m|n)-dash into single dash for html.
+    $text =~ s@\-\-+@\-@g;
     # quoted { } chars
     $text =~ s@\\{@{@g;
     $text =~ s@\\}@}@g;
+    # finally to protect against macro language...
+    $text =~ s@\\@\\\\@g;
     return $text;
+}
+sub process_latex_math { # convert the contents of a LaTeX $...$ span into simple HTML markup
+    my $text = shift; # the math-mode text, i.e. what was captured between the two $ signs
+    $text =~ s@\\infty@infinity@g;       # or unicode 0x221E...
+    $text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript (braced form: ^{...})
+    $text =~ s@\^([^\{])@<sup>$1</sup>@g; # superscript of a single unbraced character
+    $text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript (braced form: _{...})
+    $text =~ s@\_([^\{])@<sub>$1</sub>@g; # subscript of a single unbraced character
+    # put all other command names in italics
+    $text =~ s@\\([\w]+)@<i>$1</i>@g; # the leading backslash is dropped, only the name is kept
+    return $text; # the caller's s///e substitutes this string for the whole $...$ match
+}
 sub set_OID {

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 3249 for trunk/gsdl/perllib/plugins/BibTexPlug.pm

Legend:

trunk/gsdl/perllib/plugins/BibTexPlug.pm

Download in other formats: