Ignore:
Timestamp:
2002-07-12T15:19:17+12:00 (22 years ago)
Author:
jrm21
Message:

1) add a space when joining consecutive lines, just in case.

2) Don't use ',' to separate author names.

3) Proper name parsing: first, von, last, jr. And we modify it slightly for
the Creator metadata so it's a nice list with only one "and".

4) Proper Date metadata in the greenstone Date format yyyymmdd so that the
receptionist doesn't print out corrupted strings.

5) Don't create BibTex metadata, as it is exactly the same as the [Text].

6) Modified latex accent parsing, so it is faster - only substitute found
accents instead of old brute force of whole hash.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/BibTexPlug.pm

    r3156 r3249  
    132132    } else {
    133133        # this is a continuation of previous line
    134         $entry_line .= $input_line;
     134        $entry_line .= " " . $input_line;
    135135    }
    136136   
     
    202202        $value=expand_month($value);
    203203    }
    204     # Add the various fields as metadata   
    205     my $html_value = &text_into_html($value);
    206     $doc_obj->add_utf8_metadata ($cursection, $name, $html_value);
    207204
    208205    # Several special operatons on metadata follow
     
    218215        $k =~ s/^\s*//;
    219216        if ($k =~ /\w/) {
    220             $k = &text_into_html($k);
    221217            $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
    222218        }
     
    230226    if ($entryname eq "author") { #added also comparison with editor
    231227       
     228        # take care of "et al."...
     229        $value =~ s/(\s+et\.?\s+al\.?)\s*$//;
     230        my $etal=$1;
     231        $etal="" if (!defined ($etal));
    232232        # und here for german language...
    233233        # don't use brackets in pattern, else the matched bit becomes
    234234        # an element in the list!
    235         my @authorlist = split(/,|\s+and\s+|\s+und\s+/, $value);
     235        my @authorlist = split(/\s+and\s+|\s+und\s+/, $value);
     236        my @formattedlist = ();
    236237        foreach $a (@authorlist) {
    237238        $a =~ s/\s*$//;
     
    239240        # Reformat and add author name
    240241        next if $a=~ /^\s*$/;
    241         my @words = split(/ /, $a);
    242         my $lastname = pop @words;
    243         my $firstname = join(" ",  @words);
    244 
    245         my $fullname = $lastname . ", " . $firstname;
     242       
     243        # names are "First von Last", "von Last, First"
     244        # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
     245        my $first="";
     246        my $vonlast="";
     247        my $jr="";
     248
     249        if ($a =~ /,/) {
     250            my @parts=split(/,\s*/, $a);
     251            $first = pop @parts;
     252            if (scalar(@parts) == 2) {
     253            $jr = pop @parts;
     254            }
     255            $vonlast=shift @parts;
     256            if (scalar(@parts) > 0) {
     257            print $outhandle "BibTexPlug: couldn't parse name $a\n";
     258            # but we continue anyway...
     259            }
     260        } else { # First von Last
     261            my @words = split(/ /, $a);
     262            while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
     263            $first .= " " . shift (@words);
     264            }
     265            $first =~ s/^\s//;
     266            $vonlast = join (' ', @words); # whatever's left...
     267        }
     268        my $von="";
     269        my $last="";
     270        if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
     271            $vonlast =~ s/^(([a-z]\w+\s+)+)//;
     272            $von = $1;
     273            if (!defined ($von)) {
     274            # some non-English names do start with lowercase
     275            # eg "Marie desJardins". Also we can get typos...
     276            print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
     277            $von="";
     278            if ($vonlast =~ /^[a-z]+$/) {
     279                # if it's all lowercase, uppercase 1st.
     280                $vonlast =~ s/^(.)/\u$1/;
     281
     282            }
     283            }
     284            $von =~ s/\s*$//;
     285            $last=$vonlast;
     286        } else {
     287            $last=$vonlast;
     288        }
     289        my $wholename="$first $von $last $jr";
     290        $wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
     291        push (@formattedlist, $wholename);
     292        my $fullname = "$last";
     293        $fullname .= " $jr" if ($jr);
     294        $fullname .= ", $first";
     295        $fullname .= " $von" if ($von);
    246296
    247297        # Add each name to set of Authors
    248298        # force utf8 pragma so that \w matches in this scope
    249299        use utf8;
    250         if ($fullname =~ /\w+, \w+/) {
    251             $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
    252         } else {
    253         }
     300        $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
    254301        }
     302
     303        # Only want at most one "and" in the Creator field
     304        if (scalar(@formattedlist) > 2) {
     305        my $lastauthor=pop @formattedlist;
     306        $value=join(', ', @formattedlist);
     307        $value.=" and $lastauthor";
     308        } else { # 1 or 2 authors...
     309        $value=join(" and ",@formattedlist);
     310        }
     311        $value.=$etal; # if there was "et al."
    255312    }
    256313
     
    262319    }
    263320
     321
     322    # Add the various fields as metadata   
     323    $doc_obj->add_utf8_metadata ($cursection, $name, $value);
    264324    }
    265325
    266     # Add the text in BibTex format (all fields)
     326    # Add Date (yyyymmdd) metadata
     327    if (defined ($metadata{'year'}) ) {
     328    my $date=$metadata{'year'};
     329    chomp $date;
     330    my $month=$metadata{'month'};
     331    if (defined($month)) {
     332        # month is currently 3 letter code or a range...
     333        $month = expand_month($month);
     334        # take the first month found... might not find one!
     335        $month =~ m/_textmonth(\d\d)_/;
     336        $month = $1;
     337    }
     338    if (!defined($month)) {
     339        $month="00";
     340    }
     341    $date .= "${month}00";
     342    $doc_obj->add_utf8_metadata($cursection, "Date", $date);
     343}
     344
     345#    # Add the text in BibTex format (all fields)
    267346    if ($text =~ /\w/) {
    268     $text = &text_into_html($text);
     347
     348    $text =~ s@&@&@g;
     349    $text =~ s@<@&lt;@g;
     350    $text =~ s@>@&gt;@g;
     351    $text =~ s@\n@<br/>\n@g;
     352    $text =~ s@\\@\\\\@g;
     353
     354# Not really required...
     355#   $doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
    269356    $doc_obj->add_utf8_text ($cursection, $text);
    270     $doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
    271357    }
    272358
     
    310396    $text =~ s/\`\`/\"/g; #Latex -specific conversion
    311397
    312 
    313398    $text =~ s/\"/&quot;/g;
    314399    $text =~ s/\'/&#8217;/g;
    315400    $text =~ s/\`/&#8216;/g;
    316     $text =~ s/\+/ /g;
    317     $text =~ s/\(/ /g;
    318     $text =~ s/\)/ /g;
     401
     402#    $text =~ s/\+/ /g;
     403#    $text =~ s/\(/ /g;
     404#    $text =~ s/\)/ /g;
    319405
    320406    $text =~ s/\\/\\\\/g;
    321407
    322     $text =~ s/\./\\\./g;
     408#    $text =~ s/\./\\\./g;
    323409
    324410    return $text;
     
    457543     '~o' => chr(0xc3).chr(0xb5),
    458544     # caron - handled specially
    459 #      ',s' => chr(0xc5).chr(0xa1),
    460 #      ',S' => chr(0xc5).chr(0xa5),
    461545     # double acute
    462546     # ring
    463547     # dot
    464      '\.c' => chr(0xc4).chr(0x8b),
    465      '\.C' => chr(0xc4).chr(0x8a),
    466      '\.e' => chr(0xc4).chr(0x97),
    467      '\.E' => chr(0xc4).chr(0x96),
    468      '\.g' => chr(0xc4).chr(0xa1),
    469      '\.G' => chr(0xc4).chr(0xa0),
    470      '\.I' => chr(0xc4).chr(0xb0),
    471      '\.z' => chr(0xc5).chr(0xbc),
    472      '\.Z' => chr(0xc5).chr(0xbb),
     548     '.c' => chr(0xc4).chr(0x8b),
     549     '.C' => chr(0xc4).chr(0x8a),
     550     '.e' => chr(0xc4).chr(0x97),
     551     '.E' => chr(0xc4).chr(0x96),
     552     '.g' => chr(0xc4).chr(0xa1),
     553     '.G' => chr(0xc4).chr(0xa0),
     554     '.I' => chr(0xc4).chr(0xb0),
     555     '.z' => chr(0xc5).chr(0xbc),
     556     '.Z' => chr(0xc5).chr(0xbb),
    473557     # macron
    474558     '=a' => chr(0xc4).chr(0x81),
     
    486570     
    487571     # cedilla - handled specially
    488      
    489572     );
    490573   
     
    551634   
    552635    # convert latex-style accented characters.
     636
    553637    # remove space (if any) between \ and letter to accent (eg {\' a})
    554638
    555     $text =~ s@(\\[`'="])\s(\w)@$1$2@g;
     639    $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
    556640
    557641    # remove {} around a single character (eg \'{e})
    558     $text =~ s@(\\[`'="\.]){(\w)}@{$1$2}@g;
     642    $text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
     643
     644    # \, is another way of doing cedilla \c
     645    $text =~ s@\\,(.)@\\c $1@g;
    559646
    560647    # remove {} around a single character for special 1 letter commands -
    561648    # need to insert a space. Eg \v{s}  ->  {\v s}
    562649    $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
    563     # this is slow (go through whole hash for each substitution!) so
     650
    564651    # only do if the text contains a '\' character.
    565652    if ($text =~ m|\\|) {
    566       for $latex_code (keys %utf8_chars) {
    567       $text =~ s/\\$latex_code/$utf8_chars{$latex_code}/g;
    568       }
    569 
    570       # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
    571       # only do the change if immediately followed by a space, }, {, or \
    572       for $latex_code (keys %special_utf8_chars) {
    573       $text =~ s/\\${latex_code}([\\\s{}])/$special_utf8_chars{$latex_code}$1/g;
    574       }
     653    # "normal" accents - ie non-alpha latex tag
     654    while ($text =~ m@\\([`'="^~\.])([\w])@) {
     655        my $tex="$1$2"; my $char="$2";
     656        my $replacement=$utf8_chars{$tex};
     657        if (!defined($replacement)) {
     658        print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
     659        $replacement=$char;
     660        }
     661        $text =~ s/\\$tex/$replacement/g;
     662    }
     663
     664        # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
     665        # only do the change if immediately followed by a space, }, {, or \
     666    # one letter accents ( + ss)
     667        while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) {
     668        my $tex=$1;
     669        my $replacement=$special_utf8_chars{$tex};
     670        if (!defined($replacement)) {
     671        print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
     672        $replacement=$tex;
     673        }
     674        $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
     675
     676    }
     677
     678    # one letter latex accent commands that affect following letter
     679        while ($text =~ m@\\([uvcH]) ([\w])@) {
     680          my $tex="$1 $2"; my $char="$2";
     681          my $replacement=$special_utf8_chars{$tex};
     682          if (!defined($replacement)) {
     683          print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
     684          $replacement=$char;
     685      }
     686          $text =~ s/\\$tex/$replacement/g;
     687        }
    575688    }
     689
     690    # escape html-sensitive characters
     691    $text =~ s@&@&amp;@g;
     692    $text =~ s@<@&lt;@g;
     693    $text =~ s@>@&gt;@g;
     694    $text =~ s/''/"/g; # Latex-specific
     695    $text =~ s/``/"/g; # Latex-specific
     696    # greenstone-specific
     697    $text =~ s@\[@&\#91;@g;
     698    $text =~ s@\]@&\#93;@g;
     699
    576700    # remove latex commands
    577     # commands with optional arguments...
     701
     702    # explicitly recognised commands
     703    $text =~ s@\\ldots@&hellip;@g;
     704
     705    # maths mode
     706    $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
     707
     708    # remove all other commands with optional arguments...
    578709    $text =~ s@\\\w+(\[.*?\])?\s*@@g;
    579710    # $text =~ s@\\noopsort{[^}]+\}@@g;
     
    581712   
    582713    # remove latex groupings { } (but not \{ or \} )
    583     while ($text =~ s/([^\\])[\{\}]/$1/g) {;}
     714    while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
    584715    $text =~ s/^\{//; # remove { if first char
    585    
    586     # maths mode $...$ - this is not interpreted in any way at the moment...
    587     $text =~ s@\$(.*)\$@$1@g;
    588716   
    589717    # latex characters
    590718    # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
    591     $text =~ s/([^\\])~+/$1/g; # non-breaking space  "~"
     719    $text =~ s/([^\\])~+/$1 /g; # non-breaking space  "~"
    592720    # optional break "\-"
    593     if ($text =~ m/\#/) { # concat macros (bibtex)
     721    if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
    594722    # the non-macro bits have quotes around them - we just remove quotes
    595     $text =~ s/[\"\#]//g;
     723    $text =~ s/([^&])[\"\#]/$1/g;
    596724    }
     725    # dashes. Convert (m|n)-dash into single dash for html.
     726    $text =~ s@\-\-+@\-@g;
     727
    597728    # quoted { } chars
    598729    $text =~ s@\\{@{@g;
    599730    $text =~ s@\\}@}@g;
    600731
     732    # finally to protect against macro language...
     733    $text =~ s@\\@\\\\@g;
     734
    601735    return $text;
    602736}
    603737
     738
     739sub process_latex_math {
     740    my $text = shift;
     741
     742    $text =~ s@\\infty@infinity@g;       # or unicode 0x221E...
     743    $text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
     744    $text =~ s@\^([^\{])@<sup>$1</sup>@g;
     745    $text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
     746    $text =~ s@\_([^\{])@<sub>$1</sub>@g;
     747
     748    # put all other command names in italics
     749    $text =~ s@\\([\w]+)@<i>$1</i>@g;
     750
     751    return $text;
     752}
    604753
    605754sub set_OID {
Note: See TracChangeset for help on using the changeset viewer.