Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 10723

Last change on this file since 10723 was 10254, checked in by kjdon, 19 years ago
added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile
Property svn:keywords set to `Author Date Id Revision`
File size: 23.3 KB

Rev	Line
[1676]	1	###########################################################################
	2	#
	3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright 2000 Gordon W. Paynter
[2901]	10	# Copyright 1999-2001 New Zealand Digital Library Project
[1676]	11	#
	12	# This program is free software; you can redistribute it and/or modify
	13	# it under the terms of the GNU General Public License as published by
	14	# the Free Software Foundation; either version 2 of the License, or
	15	# (at your option) any later version.
	16	#
	17	# This program is distributed in the hope that it will be useful,
	18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	# GNU General Public License for more details.
	21	#
	22	# You should have received a copy of the GNU General Public License
	23	# along with this program; if not, write to the Free Software
	24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	25	#
	26	###########################################################################
	27
	28
	29	# BibTexPlug reads bibliography files in BibTex format.
	30	#
	31	# by Gordon W. Paynter ([email protected]), November 2000
	32	# Based on ReferPlug. See ReferPlug for geneology.
	33	#
	34	# BibTexPlug creates a document object for every reference a the file.
	35	# It is a subclass of SplitPlug, so if there are multiple records, all
	36	# are read.
[2901]	37	#
	38	# Modified Dec 2001 by John McPherson:
	39	# * some modifications submitted by Sergey Yevtushenko
	40	# <[email protected]>
	41	# * some non-ascii char support (ie mostly Latin)
[1676]	42
[3587]	43
[1676]	44	package BibTexPlug;
	45
	46	use SplitPlug;
[10254]	47	use strict;
	48	no strict 'refs'; # allow filehandles to be variables and viceversa
[1676]	49
	50	# BibTexPlug is a sub-class of BasPlug.
	51	sub BEGIN {
[10254]	52	@BibTexPlug::ISA = ('SplitPlug');
[1676]	53	}
	54
[3540]	55	my $arguments =
[4744]	56	[ { 'name' => "process_exp",
[4873]	57	'desc' => "{BasPlug.process_exp}",
[6408]	58	'type' => "regexp",
[4873]	59	'reqd' => "no",
[6408]	60	'deft' => &get_default_process_exp() },
	61	{ 'name' => "split_exp",
	62	'desc' => "{SplitPlug.split_exp}",
	63	'type' => "regexp",
	64	'deft' => &get_default_split_exp(),
	65	'reqd' => "no" }
	66	];
[3540]	67
[4744]	68	my $options = { 'name' => "BibTexPlug",
[5680]	69	'desc' => "{BibTexPlug.desc}",
[6408]	70	'abstract' => "no",
	71	'inherits' => "yes",
[9120]	72	'explodes' => "yes",
[4744]	73	'args' => $arguments };
[3540]	74
[1676]	75	# This plugin processes files with the suffix ".bib"
	76	sub get_default_process_exp {
	77	return q^(?i)\.bib$^;
	78	}
	79
	80	# This plugin splits the input text at blank lines
	81	sub get_default_split_exp {
	82	return q^\n+(?=@)^;
	83	}
[3540]	84	sub new {
[10218]	85	my ($class) = shift (@_);
	86	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	87	push(@$pluginlist, $class);
	88
	89	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
	90	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
	91
	92	my $self = (defined $hashArgOptLists)? new SplitPlug($pluginlist,$inputargs,$hashArgOptLists): new SplitPlug($pluginlist,$inputargs);
	93
[3540]	94	return bless $self, $class;
	95	}
[1676]	96
[2901]	97
	98
[1676]	99	# The process function reads a single bibliographic record and stores
	100	# it as a new document.
	101
	102	sub process {
	103	my $self = shift (@_);
[6332]	104	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1676]	105	my $outhandle = $self->{'outhandle'};
	106
[10254]	107	my $cursection = $doc_obj->get_top_section();
[2484]	108	$self->{'key'} = "default";
	109
[1676]	110	# Check that we're dealing with a valid BibTex record
	111	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
	112
	113	# Ignore things we can't use
	114	return 0 if ($$textref =~ /^\@String/);
	115
	116	# Report that we're processing the file
[6332]	117	print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
[1676]	118	print $outhandle "BibTexPlug: processing $file\n"
	119	if ($self->{'verbosity'}) > 1;
	120
	121
[1711]	122	# This hash translates BibTex field names into metadata names. The
[1676]	123	# BibTex names are taken from the "Local Guide to Latex" Graeme
[2901]	124	# McKinstry. Metadata names are consistent with ReferPlug.
[1676]	125
[9582]	126	# The author metadata will be stored as one "Creator" entry, but will
	127	# also be split up into several individual "Author" fields.
	128
[1676]	129	my %field = (
	130	'address', 'PublisherAddress',
	131	'author', 'Creator',
[2901]	132
[1676]	133	'booktitle', 'Booktitle',
	134	'chapter', 'Chapter',
	135	'edition', 'Edition',
[2901]	136	'editor', 'Editor',
[1676]	137	'institution', 'Publisher',
	138	'journal', 'Journal',
	139	'month', 'Month',
	140	'number', 'Number',
	141	'pages', 'Pages',
	142	'publisher', 'Publisher',
	143	'school', 'Publisher',
	144	'title', 'Title',
	145	'volume', 'Volume',
[3142]	146	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
[1676]	147
	148	'keywords', 'Keywords',
	149	'abstract', 'Abstract',
[4792]	150	'copyright', 'Copyright',
	151	'note', 'Note',
[8121]	152	);
[1676]	153
	154	# Metadata fields
	155	my %metadata;
[4792]	156	my ($EntryType, $Creator, $Keywords, $text);
[1676]	157
[2901]	158	my $verbosity = $self->{'verbosity'};
	159	$verbosity = 0 unless $verbosity;
	160
[3142]	161	# Make sure the text has exactly one entry per line -
	162	# append line to previous if it doesn't start with " <key> = "
[2901]	163
[4792]	164	my @input_lines=split(/\r?\n/, $$textref);
[3142]	165	my @all_lines;
	166	my $entry_line=shift @input_lines;
	167	foreach my $input_line (@input_lines) {
	168	if ($input_line =~ m/^\s\w+\s=\s*/) {
	169	# this is a new key
	170	push(@all_lines, $entry_line);
	171	$entry_line=$input_line;
	172	} else {
	173	# this is a continuation of previous line
[3249]	174	$entry_line .= " " . $input_line;
[3142]	175	}
	176
	177	}
	178	# add final line, removing trailing '}'
	179	$entry_line =~ s/\}\s*$//;
	180	push(@all_lines, $entry_line);
	181	push(@all_lines, "}");
[2901]	182
[1676]	183	# Read and process each line in the bib file.
[2901]	184	my ($entryname, $name, $value, $line);
	185	foreach $line (@all_lines) {
	186
[1676]	187	# Add each line. Most lines consist of a field identifer and
	188	# then data, and we simply store them, though we treat some
	189	# of the fields a bit differently.
	190
	191	$line =~ s/\s+/ /g;
	192	$text .= "$line\n";
	193
[4792]	194
[2901]	195	print "Processing line = $line \n" if $verbosity>=4;
	196
[1676]	197	# The first line is special, it contains the reference type and OID
[4792]	198	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-])\W*$/) {
[1676]	199	$EntryType = $1;
[4792]	200	my $EntryID = (defined $2) ? $2 : "default";
[2484]	201	print "** $EntryType - \"$EntryID\" \n"
	202	if ($verbosity >= 4);
[4792]	203
[1676]	204	next;
	205	}
[2484]	206	if ($line =~ /\@/) {
[2901]	207	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
[2484]	208	if ($verbosity >= 2);
[2901]	209	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
[2484]	210	if ($verbosity >= 2);
	211	}
[1676]	212
	213	# otherwise, parse the metadata out of this line
[9582]	214	next unless ($line =~ /^\s(\w+)\s=\s(.)/);
[2901]	215	$entryname = lc($1);
[1676]	216	$value = $2;
[3112]	217	$value =~ s/,?\s*$//; # remove trailing comma and space
	218	if ($value =~ /^"/ && $value =~ /"$/) {
	219	# remove surrounding " marks
	220	$value =~ s/^"//; $value =~ s/"$//;
	221	}
[2901]	222	$value = &process_latex($value);
	223
[1676]	224	# Add this line of metadata
[3426]	225	$metadata{$entryname} .= "$value";
[2901]	226
[1676]	227	}
	228
[1677]	229	# Add the Entry type as metadata
[2901]	230	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
[8121]	231
	232	#Add the fileformat as metadata
	233	$doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
[1676]	234
[8121]	235
[1676]	236	# Add the various field as metadata
[2901]	237	foreach my $entryname (keys %metadata) {
	238	next unless (defined $field{$entryname});
	239	next unless (defined $metadata{$entryname});
[1676]	240
[2901]	241	$name = $field{$entryname};
	242	$value = $metadata{$entryname};
[1676]	243
[3142]	244	if ($name =~ /^Month/) {
	245	$value=expand_month($value);
	246	}
[1676]	247
	248	# Several special operatons on metadata follow
	249
	250	# Add individual keywords.
	251	# The full set of keywords will be added, in due course, as "Keywords".
	252	# However, we also want to add them as individual "Keyword" metadata elements.
[2901]	253	if ($entryname eq "keywords") {
[1676]	254	my @keywordlist = split(/,/, $value);
	255	foreach my $k (@keywordlist) {
[2901]	256	$k = lc($k);
[1676]	257	$k =~ s/\s*$//;
	258	$k =~ s/^\s*//;
	259	if ($k =~ /\w/) {
[2901]	260	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
[1676]	261	}
	262	}
	263	}
	264
	265	# Add individual authors
	266	# The author metadata will be stored as one "Creator" entry, but we
	267	# also want to split it into several individual "Author" fields in
[3112]	268	# "Lastname, Firstnames" format so we can browse it.
[2901]	269	if ($entryname eq "author") { #added also comparison with editor
	270
[3249]	271	# take care of "et al."...
[9582]	272	my $etal='';
	273	if ($value =~ s/\s+(and\s+others\|et\.?\s+al\.?)\s*$//i) {
	274	$etal=' <em>et. al.</em>';
	275	}
[2901]	276	# und here for german language...
	277	# don't use brackets in pattern, else the matched bit becomes
	278	# an element in the list!
[3249]	279	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
	280	my @formattedlist = ();
[1676]	281	foreach $a (@authorlist) {
	282	$a =~ s/\s*$//;
	283	$a =~ s/^\s*//;
	284	# Reformat and add author name
[2901]	285	next if $a=~ /^\s*$/;
[3249]	286
	287	# names are "First von Last", "von Last, First"
	288	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
	289	my $first="";
	290	my $vonlast="";
	291	my $jr="";
[2901]	292
[3249]	293	if ($a =~ /,/) {
	294	my @parts=split(/,\s*/, $a);
	295	$first = pop @parts;
	296	if (scalar(@parts) == 2) {
	297	$jr = pop @parts;
	298	}
	299	$vonlast=shift @parts;
	300	if (scalar(@parts) > 0) {
	301	print $outhandle "BibTexPlug: couldn't parse name $a\n";
	302	# but we continue anyway...
	303	}
	304	} else { # First von Last
	305	my @words = split(/ /, $a);
	306	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
	307	$first .= " " . shift (@words);
	308	}
	309	$first =~ s/^\s//;
	310	$vonlast = join (' ', @words); # whatever's left...
	311	}
	312	my $von="";
	313	my $last="";
	314	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
	315	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
	316	$von = $1;
	317	if (!defined ($von)) {
	318	# some non-English names do start with lowercase
	319	# eg "Marie desJardins". Also we can get typos...
	320	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
	321	$von="";
	322	if ($vonlast =~ /^[a-z]+$/) {
	323	# if it's all lowercase, uppercase 1st.
	324	$vonlast =~ s/^(.)/\u$1/;
[2901]	325
[3249]	326	}
	327	}
	328	$von =~ s/\s*$//;
	329	$last=$vonlast;
	330	} else {
	331	$last=$vonlast;
	332	}
	333	my $wholename="$first $von $last $jr";
[9582]	334	$wholename =~ s/\s+/ /g; # squeeze multiple spaces
	335	$wholename =~ s/ $//;
[3249]	336	push (@formattedlist, $wholename);
	337	my $fullname = "$last";
	338	$fullname .= " $jr" if ($jr);
	339	$fullname .= ", $first";
	340	$fullname .= " $von" if ($von);
	341
[1676]	342	# Add each name to set of Authors
[3249]	343	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
[1676]	344	}
[3249]	345
	346	# Only want at most one "and" in the Creator field
	347	if (scalar(@formattedlist) > 2) {
	348	my $lastauthor=pop @formattedlist;
	349	$value=join(', ', @formattedlist);
	350	$value.=" and $lastauthor";
	351	} else { # 1 or 2 authors...
	352	$value=join(" and ",@formattedlist);
	353	}
	354	$value.=$etal; # if there was "et al."
[1676]	355	}
	356
	357	# Books and Journals are additionally marked for display purposes
[2901]	358	if ($entryname eq "booktitle") {
	359	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
	360	} elsif ($entryname eq "journal") {
	361	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
[1676]	362	}
	363
[3249]	364	# Add the various fields as metadata
	365	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
[1676]	366	}
	367
[9582]	368	# for books and journals...
	369	if (!exists $metadata{'title'}) {
	370	my $name=$field{'title'}; # get Greenstone metadata name
	371	my $value;
	372	if (exists $metadata{'booktitle'}) {
	373	$value=$metadata{'booktitle'};
	374	} elsif (exists $metadata{'journal'}) {
	375	$value=$metadata{'journal'};
	376	}
	377	if ($value) {
	378	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
	379	}
	380	}
	381
[3249]	382	# Add Date (yyyymmdd) metadata
	383	if (defined ($metadata{'year'}) ) {
	384	my $date=$metadata{'year'};
	385	chomp $date;
	386	my $month=$metadata{'month'};
	387	if (defined($month)) {
	388	# month is currently 3 letter code or a range...
	389	$month = expand_month($month);
	390	# take the first month found... might not find one!
	391	$month =~ m/_textmonth(\d\d)_/;
	392	$month = $1;
	393	}
	394	if (!defined($month)) {
	395	$month="00";
	396	}
	397	$date .= "${month}00";
	398	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
	399	}
	400
	401	# # Add the text in BibTex format (all fields)
[1676]	402	if ($text =~ /\w/) {
[3249]	403
	404	$text =~ s@&@&@g;
	405	$text =~ s@<@<@g;
	406	$text =~ s@>@>@g;
	407	$text =~ s@\n@<br/>\n@g;
	408	$text =~ s@\\@\\\\@g;
	409
[2901]	410	$doc_obj->add_utf8_text ($cursection, $text);
[1676]	411	}
	412
[1677]	413	return 1;
[1676]	414	}
	415
[1677]	416
[2901]	417
	418
	419	# convert email addresses and URLs into links
	420	sub convert_urls_into_links{
	421	my ($text) = @_;
	422
	423	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[3540]	424	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
[2901]	425
	426	return $text;
	427	}
	428
	429	# Clean up whitespace and convert \n charaters to <BR> or <P>
	430	sub clean_up_whitespaces{
	431	my ($text) = @_;
	432
	433	$text =~ s/%%%%%/<BR> <BR>/g;
	434	$text =~ s/ +/ /g;
	435	$text =~ s/\s*$//;
	436	$text =~ s/^\s*//;
	437	$text =~ s/\n/\n<BR>/g;
	438	$text =~ s/<BR>\s*<BR>/<P>/g;
	439
	440	return $text;
	441	}
	442
	443
	444	sub convert_problem_characters_without_ampersand{
	445	my ($text) = @_;
	446	$text =~ s/</</g;
	447	$text =~ s/>/>/g;
	448
	449	$text =~ s/\'\'/\"/g; #Latex -specific conversion
	450	$text =~ s/\`\`/\"/g; #Latex -specific conversion
	451
	452	$text =~ s/\"/"/g;
	453	$text =~ s/\'/’/g;
	454	$text =~ s/\`/‘/g;
	455
[3249]	456	# $text =~ s/\+/ /g;
	457	# $text =~ s/\(/ /g;
	458	# $text =~ s/\)/ /g;
	459
[2901]	460	$text =~ s/\\/\\\\/g;
	461
[3249]	462	# $text =~ s/\./\\\./g;
[2901]	463
	464	return $text;
	465	}
	466
[1676]	467	# Convert a text string into HTML.
	468
[1677]	469	# The HTML is going to be inserted into a GML file, so we have to be
[2901]	470	# careful not to use symbols like ">", which occurs frequently in email
[1677]	471	# messages (and use &gt instead.
	472
	473	# This function also turns URLs and email addresses into links, and
	474	# replaces carriage returns with <BR> tags (and multiple carriage returns
	475	# with <P> tags).
	476
[1676]	477	sub text_into_html {
	478	my ($text) = @_;
	479
[2901]	480	# Convert problem characters into HTML symbols
[1676]	481	$text =~ s/&/&/g;
	482
[2901]	483	$text = &convert_problem_characters_without_ampersand( $text );
	484
[1676]	485	# convert email addresses and URLs into links
[2901]	486	$text = &convert_urls_into_links( $text );
[1676]	487
[2901]	488	$text = &clean_up_whitespaces( $text );
[1676]	489
	490	return $text;
	491	}
	492
[2901]	493
[3142]	494	sub expand_month {
	495	my $text=shift;
[2901]	496
[3142]	497	# bibtex style files expand abbreviations for months.
	498	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
	499	$text =~ s/jan/_textmonth01_/g;
	500	$text =~ s/feb/_textmonth02_/g;
	501	$text =~ s/mar/_textmonth03_/g;
	502	$text =~ s/apr/_textmonth04_/g;
	503	$text =~ s/may/_textmonth05_/g;
	504	$text =~ s/jun/_textmonth06_/g;
	505	$text =~ s/jul/_textmonth07_/g;
	506	$text =~ s/aug/_textmonth08_/g;
	507	$text =~ s/sep/_textmonth09_/g;
	508	$text =~ s/oct/_textmonth10_/g;
	509	$text =~ s/nov/_textmonth11_/g;
	510	$text =~ s/dec/_textmonth12_/g;
[2901]	511
[3142]	512	return $text;
	513	}
	514
	515
[2901]	516	# Convert accented characters, remove { }, interprete some commands....
	517	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
	518	sub process_latex {
	519	my ($text) = @_;
	520
	521	# note - this is really ugly, but it works. There may be a prettier way
	522	# of mapping latex accented chars to utf8, but we just brute force it here.
	523	# Also, this isn't complete - not every single possible accented letter
	524	# is in here yet, but most of the common ones are.
	525
	526	my %utf8_chars =
	527	(
	528	# acutes
	529	'\'a' => chr(0xc3).chr(0xa1),
	530	'\'c' => chr(0xc4).chr(0x87),
	531	'\'e' => chr(0xc3).chr(0xa9),
	532	'\'i' => chr(0xc3).chr(0xad),
	533	'\'l' => chr(0xc3).chr(0xba),
	534	'\'n' => chr(0xc3).chr(0x84),
	535	'\'o' => chr(0xc3).chr(0xb3),
	536	'\'r' => chr(0xc5).chr(0x95),
	537	'\'s' => chr(0xc5).chr(0x9b),
	538	'\'u' => chr(0xc3).chr(0xba),
	539	'\'y' => chr(0xc3).chr(0xbd),
	540	'\'z' => chr(0xc5).chr(0xba),
	541	# graves
	542	'`a' => chr(0xc3).chr(0xa0),
	543	'`A' => chr(0xc3).chr(0x80),
	544	'`e' => chr(0xc3).chr(0xa8),
	545	'`E' => chr(0xc3).chr(0x88),
	546	'`i' => chr(0xc3).chr(0xac),
	547	'`I' => chr(0xc3).chr(0x8c),
	548	'`o' => chr(0xc3).chr(0xb2),
	549	'`O' => chr(0xc3).chr(0x92),
	550	'`u' => chr(0xc3).chr(0xb9),
	551	'`U' => chr(0xc3).chr(0x99),
	552	# circumflex
	553	'^a' => chr(0xc3).chr(0xa2),
	554	'^A' => chr(0xc3).chr(0x82),
	555	'^c' => chr(0xc4).chr(0x89),
	556	'^C' => chr(0xc4).chr(0x88),
	557	'^e' => chr(0xc3).chr(0xaa),
	558	'^E' => chr(0xc3).chr(0x8a),
	559	'^g' => chr(0xc4).chr(0x9d),
	560	'^G' => chr(0xc4).chr(0x9c),
	561	'^h' => chr(0xc4).chr(0xa5),
	562	'^H' => chr(0xc4).chr(0xa4),
	563	'^i' => chr(0xc3).chr(0xae),
	564	'^I' => chr(0xc3).chr(0x8e),
	565	'^j' => chr(0xc4).chr(0xb5),
	566	'^J' => chr(0xc4).chr(0xb4),
	567	'^o' => chr(0xc3).chr(0xb4),
	568	'^O' => chr(0xc3).chr(0x94),
	569	'^s' => chr(0xc5).chr(0x9d),
	570	'^S' => chr(0xc5).chr(0x9c),
	571	'^u' => chr(0xc3).chr(0xa2),
	572	'^U' => chr(0xc3).chr(0xbb),
	573	'^w' => chr(0xc5).chr(0xb5),
	574	'^W' => chr(0xc5).chr(0xb4),
	575	'^y' => chr(0xc5).chr(0xb7),
	576	'^Y' => chr(0xc5).chr(0xb6),
	577
	578	# diaeresis
	579	'"a' => chr(0xc3).chr(0xa4),
	580	'"A' => chr(0xc3).chr(0x84),
	581	'"e' => chr(0xc3).chr(0xab),
	582	'"E' => chr(0xc3).chr(0x8b),
	583	'"\\\\i' => chr(0xc3).chr(0xaf),
	584	'"\\\\I' => chr(0xc3).chr(0x8f),
	585	'"o' => chr(0xc3).chr(0xb6),
	586	'"O' => chr(0xc3).chr(0x96),
	587	'"u' => chr(0xc3).chr(0xbc),
	588	'"U' => chr(0xc3).chr(0x9c),
	589	'"y' => chr(0xc3).chr(0xbf),
	590	'"Y' => chr(0xc3).chr(0xb8),
	591	# tilde
[3142]	592	'~A' => chr(0xc3).chr(0x83),
	593	'~N' => chr(0xc3).chr(0x91),
	594	'~O' => chr(0xc3).chr(0x95),
	595	'~a' => chr(0xc3).chr(0xa3),
	596	'~n' => chr(0xc3).chr(0xb1),
	597	'~o' => chr(0xc3).chr(0xb5),
[2901]	598	# caron - handled specially
	599	# double acute
	600	# ring
	601	# dot
[3249]	602	'.c' => chr(0xc4).chr(0x8b),
	603	'.C' => chr(0xc4).chr(0x8a),
	604	'.e' => chr(0xc4).chr(0x97),
	605	'.E' => chr(0xc4).chr(0x96),
	606	'.g' => chr(0xc4).chr(0xa1),
	607	'.G' => chr(0xc4).chr(0xa0),
	608	'.I' => chr(0xc4).chr(0xb0),
	609	'.z' => chr(0xc5).chr(0xbc),
	610	'.Z' => chr(0xc5).chr(0xbb),
[2901]	611	# macron
	612	'=a' => chr(0xc4).chr(0x81),
	613	'=A' => chr(0xc4).chr(0x80),
	614	'=e' => chr(0xc4).chr(0x93),
	615	'=E' => chr(0xc4).chr(0x92),
	616	'=i' => chr(0xc4).chr(0xab),
	617	'=I' => chr(0xc4).chr(0xaa),
	618	'=o' => chr(0xc4).chr(0x8d),
	619	'=O' => chr(0xc4).chr(0x8c),
	620	'=u' => chr(0xc4).chr(0xab),
	621	'=U' => chr(0xc4).chr(0xaa),
	622
	623	# stroke - handled specially - see below
	624
	625	# cedilla - handled specially
	626	);
	627
	628	# these are one letter latex commands - we make sure they're not a longer
	629	# command name. eg {\d} is d+stroke, so careful of \d
	630	my %special_utf8_chars =
	631	(
[3156]	632	# breve
	633	'u g' => chr(0xc4).chr(0x9f),
	634	'u G' => chr(0xc4).chr(0x9e),
	635	'u i' => chr(0xc4).chr(0xad),
	636	'u I' => chr(0xc4).chr(0xac),
	637	'u o' => chr(0xc5).chr(0x8f),
	638	'u O' => chr(0xc5).chr(0x8e),
	639	'u u' => chr(0xc5).chr(0xad),
	640	'u U' => chr(0xc5).chr(0xac),
[2901]	641	# caron
[3156]	642	'v c' => chr(0xc4).chr(0x8d),
	643	'v C' => chr(0xc4).chr(0x8c),
[2901]	644	'v n' => chr(0xc5).chr(0x88),
	645	'v N' => chr(0xc5).chr(0x87),
	646	'v s' => chr(0xc5).chr(0xa1),
	647	'v S' => chr(0xc5).chr(0xa5),
[3156]	648	'v z' => chr(0xc5).chr(0xbe),
	649	'v Z' => chr(0xc5).chr(0xbd),
[2901]	650	# cedilla
	651	'c c' => chr(0xc3).chr(0xa7),
	652	'c C' => chr(0xc3).chr(0x87),
	653	'c g' => chr(0xc4).chr(0xa3),
	654	'c G' => chr(0xc4).chr(0xa2),
	655	'c k' => chr(0xc4).chr(0xb7),
	656	'c K' => chr(0xc4).chr(0xb6),
	657	'c l' => chr(0xc4).chr(0xbc),
	658	'c L' => chr(0xc4).chr(0xbb),
	659	'c n' => chr(0xc5).chr(0x86),
	660	'c N' => chr(0xc5).chr(0x85),
	661	'c r' => chr(0xc5).chr(0x97),
	662	'c R' => chr(0xc5).chr(0x96),
	663	'c s' => chr(0xc5).chr(0x9f),
	664	'c S' => chr(0xc5).chr(0x9e),
	665	'c t' => chr(0xc5).chr(0xa3),
	666	'c T' => chr(0xc5).chr(0xa2),
	667	# double acute / Hungarian accent
	668	'H O' => chr(0xc5).chr(0x90),
	669	'H o' => chr(0xc5).chr(0x91),
	670	'H U' => chr(0xc5).chr(0xb0),
	671	'H u' => chr(0xc5).chr(0xb1),
	672
	673	# stroke
	674	'd' => chr(0xc4).chr(0x91),
	675	'D' => chr(0xc4).chr(0x90),
	676	'h' => chr(0xc4).chr(0xa7),
	677	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
[3112]	678	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
[2901]	679	'l' => chr(0xc5).chr(0x82),
	680	'L' => chr(0xc5).chr(0x81),
	681	'o' => chr(0xc3).chr(0xb8),
	682	'O' => chr(0xc3).chr(0x98),
	683	't' => chr(0xc5).chr(0xa7),
	684	'T' => chr(0xc5).chr(0xa6),
	685	# german ss/szlig/sharp s
	686	'ss' => chr(0xc3).chr(0x9f),
	687	);
	688
	689	# convert latex-style accented characters.
[3249]	690
[2901]	691	# remove space (if any) between \ and letter to accent (eg {\' a})
	692
[3249]	693	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
[2901]	694
	695	# remove {} around a single character (eg \'{e})
[3249]	696	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
[2901]	697
[3249]	698	# \, is another way of doing cedilla \c
	699	$text =~ s@\\,(.)@\\c $1@g;
	700
[2901]	701	# remove {} around a single character for special 1 letter commands -
	702	# need to insert a space. Eg \v{s} -> {\v s}
[3156]	703	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
[3249]	704
[2901]	705	# only do if the text contains a '\' character.
	706	if ($text =~ m\|\\\|) {
[3249]	707	# "normal" accents - ie non-alpha latex tag
	708	while ($text =~ m@\\([`'="^~\.])([\w])@) {
	709	my $tex="$1$2"; my $char="$2";
	710	my $replacement=$utf8_chars{$tex};
	711	if (!defined($replacement)) {
	712	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
	713	$replacement=$char;
	714	}
[5454]	715	$text =~ s/\\\Q$tex/$replacement/g;
[3249]	716	}
[2901]	717
[3249]	718	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
	719	# only do the change if immediately followed by a space, }, {, or \
	720	# one letter accents ( + ss)
	721	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
	722	my $tex=$1;
	723	my $replacement=$special_utf8_chars{$tex};
	724	if (!defined($replacement)) {
	725	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
	726	$replacement=$tex;
	727	}
	728	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
	729
	730	}
	731
	732	# one letter latex accent commands that affect following letter
	733	while ($text =~ m@\\([uvcH]) ([\w])@) {
	734	my $tex="$1 $2"; my $char="$2";
	735	my $replacement=$special_utf8_chars{$tex};
	736	if (!defined($replacement)) {
	737	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
	738	$replacement=$char;
	739	}
	740	$text =~ s/\\$tex/$replacement/g;
	741	}
[2901]	742	}
[3249]	743
	744	# escape html-sensitive characters
	745	$text =~ s@&@&@g;
	746	$text =~ s@<@<@g;
	747	$text =~ s@>@>@g;
	748	$text =~ s/''/"/g; # Latex-specific
	749	$text =~ s/``/"/g; # Latex-specific
	750	# greenstone-specific
	751	$text =~ s@\[@&\#91;@g;
	752	$text =~ s@\]@&\#93;@g;
	753
[3112]	754	# remove latex commands
[3249]	755
	756	# explicitly recognised commands
	757	$text =~ s@\\ldots@…@g;
	758
	759	# maths mode
	760	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
	761
	762	# remove all other commands with optional arguments...
[3156]	763	$text =~ s@\\\w+(\[.?\])?\s@@g;
	764	# $text =~ s@\\noopsort{[^}]+\}@@g;
	765	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
[3142]	766
[2901]	767	# remove latex groupings { } (but not \{ or \} )
[3249]	768	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
[3142]	769	$text =~ s/^\{//; # remove { if first char
	770
	771	# latex characters
	772	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
[3249]	773	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
[3142]	774	# optional break "\-"
[3249]	775	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
[3142]	776	# the non-macro bits have quotes around them - we just remove quotes
[3249]	777	$text =~ s/([^&])[\"\#]/$1/g;
[3142]	778	}
[3249]	779	# dashes. Convert (m\|n)-dash into single dash for html.
	780	$text =~ s@\-\-+@\-@g;
	781
[2901]	782	# quoted { } chars
	783	$text =~ s@\\{@{@g;
	784	$text =~ s@\\}@}@g;
	785
[3249]	786	# finally to protect against macro language...
	787	$text =~ s@\\@\\\\@g;
	788
[2901]	789	return $text;
	790	}
	791
	792
[3249]	793	sub process_latex_math {
	794	my $text = shift;
	795
	796	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
	797	$text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
	798	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
	799	$text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
	800	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
	801
	802	# put all other command names in italics
	803	$text =~ s@\\([\w]+)@<i>$1</i>@g;
	804
	805	return $text;
	806	}
	807
[2484]	808	sub set_OID {
	809	my $self = shift (@_);
	810	my ($doc_obj, $id, $segment_number) = @_;
	811
	812	if ( $self->{'key'} eq "default") {
[4792]	813	$doc_obj->set_OID("$id\_$segment_number");
[2484]	814	} else {
	815	$doc_obj->set_OID($self->{'key'});
	816	}
	817	}
[1676]	818
[2484]	819	1;
[4792]	820
	821

Note: See TracBrowser for help on using the repository browser.

Download in other formats: