Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 13416

Last change on this file since 13416 was 13416, checked in by kjdon, 17 years ago
remove surrounding {} as well as surrounding "" from value
Property svn:keywords set to `Author Date Id Revision`
File size: 23.6 KB

Rev	Line
[1676]	1	###########################################################################
	2	#
	3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright 2000 Gordon W. Paynter
[2901]	10	# Copyright 1999-2001 New Zealand Digital Library Project
[1676]	11	#
	12	# This program is free software; you can redistribute it and/or modify
	13	# it under the terms of the GNU General Public License as published by
	14	# the Free Software Foundation; either version 2 of the License, or
	15	# (at your option) any later version.
	16	#
	17	# This program is distributed in the hope that it will be useful,
	18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	# GNU General Public License for more details.
	21	#
	22	# You should have received a copy of the GNU General Public License
	23	# along with this program; if not, write to the Free Software
	24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	25	#
	26	###########################################################################
	27
	28
	29	# BibTexPlug reads bibliography files in BibTex format.
	30	#
	31	# by Gordon W. Paynter ([email protected]), November 2000
	32	# Based on ReferPlug. See ReferPlug for geneology.
	33	#
	34	# BibTexPlug creates a document object for every reference a the file.
	35	# It is a subclass of SplitPlug, so if there are multiple records, all
	36	# are read.
[2901]	37	#
	38	# Modified Dec 2001 by John McPherson:
	39	# * some modifications submitted by Sergey Yevtushenko
	40	# <[email protected]>
	41	# * some non-ascii char support (ie mostly Latin)
[1676]	42
[3587]	43
[1676]	44	package BibTexPlug;
	45
	46	use SplitPlug;
[10254]	47	use strict;
	48	no strict 'refs'; # allow filehandles to be variables and viceversa
[1676]	49
	50	# BibTexPlug is a sub-class of BasPlug.
	51	sub BEGIN {
[10254]	52	@BibTexPlug::ISA = ('SplitPlug');
[1676]	53	}
	54
[3540]	55	my $arguments =
[4744]	56	[ { 'name' => "process_exp",
[4873]	57	'desc' => "{BasPlug.process_exp}",
[6408]	58	'type' => "regexp",
[4873]	59	'reqd' => "no",
[6408]	60	'deft' => &get_default_process_exp() },
	61	{ 'name' => "split_exp",
	62	'desc' => "{SplitPlug.split_exp}",
	63	'type' => "regexp",
	64	'deft' => &get_default_split_exp(),
	65	'reqd' => "no" }
	66	];
[3540]	67
[4744]	68	my $options = { 'name' => "BibTexPlug",
[5680]	69	'desc' => "{BibTexPlug.desc}",
[6408]	70	'abstract' => "no",
	71	'inherits' => "yes",
[9120]	72	'explodes' => "yes",
[4744]	73	'args' => $arguments };
[3540]	74
[1676]	75	# This plugin processes files with the suffix ".bib"
	76	sub get_default_process_exp {
	77	return q^(?i)\.bib$^;
	78	}
	79
	80	# This plugin splits the input text at blank lines
	81	sub get_default_split_exp {
	82	return q^\n+(?=@)^;
	83	}
[3540]	84	sub new {
[10218]	85	my ($class) = shift (@_);
	86	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	87	push(@$pluginlist, $class);
	88
	89	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
	90	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
	91
[12169]	92	my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
[10218]	93
[3540]	94	return bless $self, $class;
	95	}
[1676]	96
[2901]	97
	98
[1676]	99	# The process function reads a single bibliographic record and stores
	100	# it as a new document.
	101
	102	sub process {
	103	my $self = shift (@_);
[6332]	104	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1676]	105	my $outhandle = $self->{'outhandle'};
	106
[10254]	107	my $cursection = $doc_obj->get_top_section();
[2484]	108	$self->{'key'} = "default";
	109
[1676]	110	# Check that we're dealing with a valid BibTex record
	111	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
	112
	113	# Ignore things we can't use
	114	return 0 if ($$textref =~ /^\@String/);
	115
	116	# Report that we're processing the file
[6332]	117	print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
[1676]	118	print $outhandle "BibTexPlug: processing $file\n"
	119	if ($self->{'verbosity'}) > 1;
	120
	121
[1711]	122	# This hash translates BibTex field names into metadata names. The
[1676]	123	# BibTex names are taken from the "Local Guide to Latex" Graeme
[2901]	124	# McKinstry. Metadata names are consistent with ReferPlug.
[1676]	125
[9582]	126	# The author metadata will be stored as one "Creator" entry, but will
	127	# also be split up into several individual "Author" fields.
	128
[1676]	129	my %field = (
	130	'address', 'PublisherAddress',
	131	'author', 'Creator',
[2901]	132
[1676]	133	'booktitle', 'Booktitle',
	134	'chapter', 'Chapter',
	135	'edition', 'Edition',
[2901]	136	'editor', 'Editor',
[1676]	137	'institution', 'Publisher',
	138	'journal', 'Journal',
	139	'month', 'Month',
	140	'number', 'Number',
	141	'pages', 'Pages',
	142	'publisher', 'Publisher',
	143	'school', 'Publisher',
	144	'title', 'Title',
	145	'volume', 'Volume',
[3142]	146	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
[1676]	147
	148	'keywords', 'Keywords',
	149	'abstract', 'Abstract',
[4792]	150	'copyright', 'Copyright',
	151	'note', 'Note',
[11527]	152	'url', 'URL',
[8121]	153	);
[1676]	154
	155	# Metadata fields
	156	my %metadata;
[4792]	157	my ($EntryType, $Creator, $Keywords, $text);
[1676]	158
[2901]	159	my $verbosity = $self->{'verbosity'};
	160	$verbosity = 0 unless $verbosity;
	161
[3142]	162	# Make sure the text has exactly one entry per line -
	163	# append line to previous if it doesn't start with " <key> = "
[2901]	164
[4792]	165	my @input_lines=split(/\r?\n/, $$textref);
[3142]	166	my @all_lines;
	167	my $entry_line=shift @input_lines;
	168	foreach my $input_line (@input_lines) {
	169	if ($input_line =~ m/^\s\w+\s=\s*/) {
	170	# this is a new key
	171	push(@all_lines, $entry_line);
	172	$entry_line=$input_line;
	173	} else {
	174	# this is a continuation of previous line
[3249]	175	$entry_line .= " " . $input_line;
[3142]	176	}
	177
	178	}
	179	# add final line, removing trailing '}'
	180	$entry_line =~ s/\}\s*$//;
	181	push(@all_lines, $entry_line);
	182	push(@all_lines, "}");
[2901]	183
[1676]	184	# Read and process each line in the bib file.
[2901]	185	my ($entryname, $name, $value, $line);
	186	foreach $line (@all_lines) {
	187
[1676]	188	# Add each line. Most lines consist of a field identifer and
	189	# then data, and we simply store them, though we treat some
	190	# of the fields a bit differently.
	191
	192	$line =~ s/\s+/ /g;
	193	$text .= "$line\n";
	194
[4792]	195
[2901]	196	print "Processing line = $line \n" if $verbosity>=4;
	197
[1676]	198	# The first line is special, it contains the reference type and OID
[4792]	199	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-])\W*$/) {
[1676]	200	$EntryType = $1;
[4792]	201	my $EntryID = (defined $2) ? $2 : "default";
[2484]	202	print "** $EntryType - \"$EntryID\" \n"
	203	if ($verbosity >= 4);
[4792]	204
[1676]	205	next;
	206	}
[2484]	207	if ($line =~ /\@/) {
[2901]	208	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
[2484]	209	if ($verbosity >= 2);
[2901]	210	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
[2484]	211	if ($verbosity >= 2);
	212	}
[1676]	213
	214	# otherwise, parse the metadata out of this line
[9582]	215	next unless ($line =~ /^\s(\w+)\s=\s(.)/);
[2901]	216	$entryname = lc($1);
[1676]	217	$value = $2;
[3112]	218	$value =~ s/,?\s*$//; # remove trailing comma and space
	219	if ($value =~ /^"/ && $value =~ /"$/) {
	220	# remove surrounding " marks
	221	$value =~ s/^"//; $value =~ s/"$//;
[13416]	222	} elsif ($value =~ /^\{/ && $value =~ /\}$/) {
	223	# remove surrounding {} marks
	224	$value =~ s/^\{//; $value =~ s/\}$//;
[3112]	225	}
[13413]	226	# special case for year - we only want a 4 digit year, not
	227	# "circa 2004" etc
	228	if ($entryname=~ /^year$/i) {
	229	if ($value=~ /(\d{4})/) {
	230	$value=$1;
	231	$metadata{$entryname} .= "$value";
	232	}
	233	}
	234	else {
	235
	236	$value = &process_latex($value);
	237	# Add this line of metadata
	238	$metadata{$entryname} .= "$value";
	239	}
[1676]	240	}
	241
[1677]	242	# Add the Entry type as metadata
[2901]	243	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
[8121]	244
	245	#Add the fileformat as metadata
	246	$doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
[1676]	247
[8121]	248
[1676]	249	# Add the various field as metadata
[2901]	250	foreach my $entryname (keys %metadata) {
	251	next unless (defined $field{$entryname});
	252	next unless (defined $metadata{$entryname});
[1676]	253
[2901]	254	$name = $field{$entryname};
	255	$value = $metadata{$entryname};
[1676]	256
[3142]	257	if ($name =~ /^Month/) {
	258	$value=expand_month($value);
	259	}
[1676]	260
	261	# Several special operatons on metadata follow
	262
	263	# Add individual keywords.
	264	# The full set of keywords will be added, in due course, as "Keywords".
	265	# However, we also want to add them as individual "Keyword" metadata elements.
[2901]	266	if ($entryname eq "keywords") {
[1676]	267	my @keywordlist = split(/,/, $value);
	268	foreach my $k (@keywordlist) {
[2901]	269	$k = lc($k);
[1676]	270	$k =~ s/\s*$//;
	271	$k =~ s/^\s*//;
	272	if ($k =~ /\w/) {
[2901]	273	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
[1676]	274	}
	275	}
	276	}
	277
	278	# Add individual authors
	279	# The author metadata will be stored as one "Creator" entry, but we
	280	# also want to split it into several individual "Author" fields in
[3112]	281	# "Lastname, Firstnames" format so we can browse it.
[2901]	282	if ($entryname eq "author") { #added also comparison with editor
	283
[3249]	284	# take care of "et al."...
[9582]	285	my $etal='';
	286	if ($value =~ s/\s+(and\s+others\|et\.?\s+al\.?)\s*$//i) {
	287	$etal=' <em>et. al.</em>';
	288	}
[2901]	289	# und here for german language...
	290	# don't use brackets in pattern, else the matched bit becomes
	291	# an element in the list!
[3249]	292	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
	293	my @formattedlist = ();
[1676]	294	foreach $a (@authorlist) {
	295	$a =~ s/\s*$//;
	296	$a =~ s/^\s*//;
	297	# Reformat and add author name
[2901]	298	next if $a=~ /^\s*$/;
[3249]	299
	300	# names are "First von Last", "von Last, First"
	301	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
	302	my $first="";
	303	my $vonlast="";
	304	my $jr="";
[2901]	305
[3249]	306	if ($a =~ /,/) {
	307	my @parts=split(/,\s*/, $a);
	308	$first = pop @parts;
	309	if (scalar(@parts) == 2) {
	310	$jr = pop @parts;
	311	}
	312	$vonlast=shift @parts;
	313	if (scalar(@parts) > 0) {
	314	print $outhandle "BibTexPlug: couldn't parse name $a\n";
	315	# but we continue anyway...
	316	}
	317	} else { # First von Last
	318	my @words = split(/ /, $a);
	319	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
	320	$first .= " " . shift (@words);
	321	}
	322	$first =~ s/^\s//;
	323	$vonlast = join (' ', @words); # whatever's left...
	324	}
	325	my $von="";
	326	my $last="";
	327	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
	328	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
	329	$von = $1;
	330	if (!defined ($von)) {
	331	# some non-English names do start with lowercase
	332	# eg "Marie desJardins". Also we can get typos...
	333	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
	334	$von="";
	335	if ($vonlast =~ /^[a-z]+$/) {
	336	# if it's all lowercase, uppercase 1st.
	337	$vonlast =~ s/^(.)/\u$1/;
[2901]	338
[3249]	339	}
	340	}
	341	$von =~ s/\s*$//;
	342	$last=$vonlast;
	343	} else {
	344	$last=$vonlast;
	345	}
	346	my $wholename="$first $von $last $jr";
[9582]	347	$wholename =~ s/\s+/ /g; # squeeze multiple spaces
	348	$wholename =~ s/ $//;
[3249]	349	push (@formattedlist, $wholename);
	350	my $fullname = "$last";
	351	$fullname .= " $jr" if ($jr);
	352	$fullname .= ", $first";
	353	$fullname .= " $von" if ($von);
	354
[1676]	355	# Add each name to set of Authors
[3249]	356	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
[1676]	357	}
[3249]	358
	359	# Only want at most one "and" in the Creator field
	360	if (scalar(@formattedlist) > 2) {
	361	my $lastauthor=pop @formattedlist;
	362	$value=join(', ', @formattedlist);
	363	$value.=" and $lastauthor";
	364	} else { # 1 or 2 authors...
	365	$value=join(" and ",@formattedlist);
	366	}
	367	$value.=$etal; # if there was "et al."
[1676]	368	}
	369
	370	# Books and Journals are additionally marked for display purposes
[2901]	371	if ($entryname eq "booktitle") {
	372	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
	373	} elsif ($entryname eq "journal") {
	374	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
[1676]	375	}
	376
[3249]	377	# Add the various fields as metadata
	378	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
[13413]	379
[1676]	380	}
	381
[9582]	382	# for books and journals...
	383	if (!exists $metadata{'title'}) {
	384	my $name=$field{'title'}; # get Greenstone metadata name
	385	my $value;
	386	if (exists $metadata{'booktitle'}) {
	387	$value=$metadata{'booktitle'};
	388	} elsif (exists $metadata{'journal'}) {
	389	$value=$metadata{'journal'};
	390	}
	391	if ($value) {
	392	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
	393	}
	394	}
	395
[3249]	396	# Add Date (yyyymmdd) metadata
	397	if (defined ($metadata{'year'}) ) {
	398	my $date=$metadata{'year'};
	399	chomp $date;
	400	my $month=$metadata{'month'};
	401	if (defined($month)) {
	402	# month is currently 3 letter code or a range...
	403	$month = expand_month($month);
	404	# take the first month found... might not find one!
	405	$month =~ m/_textmonth(\d\d)_/;
	406	$month = $1;
	407	}
	408	if (!defined($month)) {
	409	$month="00";
	410	}
	411	$date .= "${month}00";
	412	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
	413	}
	414
	415	# # Add the text in BibTex format (all fields)
[1676]	416	if ($text =~ /\w/) {
[3249]	417
	418	$text =~ s@&@&@g;
	419	$text =~ s@<@<@g;
	420	$text =~ s@>@>@g;
	421	$text =~ s@\n@<br/>\n@g;
	422	$text =~ s@\\@\\\\@g;
	423
[2901]	424	$doc_obj->add_utf8_text ($cursection, $text);
[1676]	425	}
	426
[1677]	427	return 1;
[1676]	428	}
	429
[1677]	430
[2901]	431
	432
	433	# convert email addresses and URLs into links
	434	sub convert_urls_into_links{
	435	my ($text) = @_;
	436
	437	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[3540]	438	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
[2901]	439
	440	return $text;
	441	}
	442
	443	# Clean up whitespace and convert \n charaters to <BR> or <P>
	444	sub clean_up_whitespaces{
	445	my ($text) = @_;
	446
	447	$text =~ s/%%%%%/<BR> <BR>/g;
	448	$text =~ s/ +/ /g;
	449	$text =~ s/\s*$//;
	450	$text =~ s/^\s*//;
	451	$text =~ s/\n/\n<BR>/g;
	452	$text =~ s/<BR>\s*<BR>/<P>/g;
	453
	454	return $text;
	455	}
	456
	457
	458	sub convert_problem_characters_without_ampersand{
	459	my ($text) = @_;
	460	$text =~ s/</</g;
	461	$text =~ s/>/>/g;
	462
	463	$text =~ s/\'\'/\"/g; #Latex -specific conversion
	464	$text =~ s/\`\`/\"/g; #Latex -specific conversion
	465
	466	$text =~ s/\"/"/g;
	467	$text =~ s/\'/’/g;
	468	$text =~ s/\`/‘/g;
	469
[3249]	470	# $text =~ s/\+/ /g;
	471	# $text =~ s/\(/ /g;
	472	# $text =~ s/\)/ /g;
	473
[2901]	474	$text =~ s/\\/\\\\/g;
	475
[3249]	476	# $text =~ s/\./\\\./g;
[2901]	477
	478	return $text;
	479	}
	480
[1676]	481	# Convert a text string into HTML.
	482
[1677]	483	# The HTML is going to be inserted into a GML file, so we have to be
[2901]	484	# careful not to use symbols like ">", which occurs frequently in email
[1677]	485	# messages (and use &gt instead.
	486
	487	# This function also turns URLs and email addresses into links, and
	488	# replaces carriage returns with <BR> tags (and multiple carriage returns
	489	# with <P> tags).
	490
[1676]	491	sub text_into_html {
	492	my ($text) = @_;
	493
[2901]	494	# Convert problem characters into HTML symbols
[1676]	495	$text =~ s/&/&/g;
	496
[2901]	497	$text = &convert_problem_characters_without_ampersand( $text );
	498
[1676]	499	# convert email addresses and URLs into links
[2901]	500	$text = &convert_urls_into_links( $text );
[1676]	501
[2901]	502	$text = &clean_up_whitespaces( $text );
[1676]	503
	504	return $text;
	505	}
	506
[2901]	507
[3142]	508	sub expand_month {
	509	my $text=shift;
[2901]	510
[3142]	511	# bibtex style files expand abbreviations for months.
	512	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
	513	$text =~ s/jan/_textmonth01_/g;
	514	$text =~ s/feb/_textmonth02_/g;
	515	$text =~ s/mar/_textmonth03_/g;
	516	$text =~ s/apr/_textmonth04_/g;
	517	$text =~ s/may/_textmonth05_/g;
	518	$text =~ s/jun/_textmonth06_/g;
	519	$text =~ s/jul/_textmonth07_/g;
	520	$text =~ s/aug/_textmonth08_/g;
	521	$text =~ s/sep/_textmonth09_/g;
	522	$text =~ s/oct/_textmonth10_/g;
	523	$text =~ s/nov/_textmonth11_/g;
	524	$text =~ s/dec/_textmonth12_/g;
[2901]	525
[3142]	526	return $text;
	527	}
	528
	529
[2901]	530	# Convert accented characters, remove { }, interprete some commands....
	531	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
	532	sub process_latex {
	533	my ($text) = @_;
	534
	535	# note - this is really ugly, but it works. There may be a prettier way
	536	# of mapping latex accented chars to utf8, but we just brute force it here.
	537	# Also, this isn't complete - not every single possible accented letter
	538	# is in here yet, but most of the common ones are.
	539
	540	my %utf8_chars =
	541	(
	542	# acutes
	543	'\'a' => chr(0xc3).chr(0xa1),
	544	'\'c' => chr(0xc4).chr(0x87),
	545	'\'e' => chr(0xc3).chr(0xa9),
	546	'\'i' => chr(0xc3).chr(0xad),
	547	'\'l' => chr(0xc3).chr(0xba),
	548	'\'n' => chr(0xc3).chr(0x84),
	549	'\'o' => chr(0xc3).chr(0xb3),
	550	'\'r' => chr(0xc5).chr(0x95),
	551	'\'s' => chr(0xc5).chr(0x9b),
	552	'\'u' => chr(0xc3).chr(0xba),
	553	'\'y' => chr(0xc3).chr(0xbd),
	554	'\'z' => chr(0xc5).chr(0xba),
	555	# graves
	556	'`a' => chr(0xc3).chr(0xa0),
	557	'`A' => chr(0xc3).chr(0x80),
	558	'`e' => chr(0xc3).chr(0xa8),
	559	'`E' => chr(0xc3).chr(0x88),
	560	'`i' => chr(0xc3).chr(0xac),
	561	'`I' => chr(0xc3).chr(0x8c),
	562	'`o' => chr(0xc3).chr(0xb2),
	563	'`O' => chr(0xc3).chr(0x92),
	564	'`u' => chr(0xc3).chr(0xb9),
	565	'`U' => chr(0xc3).chr(0x99),
	566	# circumflex
	567	'^a' => chr(0xc3).chr(0xa2),
	568	'^A' => chr(0xc3).chr(0x82),
	569	'^c' => chr(0xc4).chr(0x89),
	570	'^C' => chr(0xc4).chr(0x88),
	571	'^e' => chr(0xc3).chr(0xaa),
	572	'^E' => chr(0xc3).chr(0x8a),
	573	'^g' => chr(0xc4).chr(0x9d),
	574	'^G' => chr(0xc4).chr(0x9c),
	575	'^h' => chr(0xc4).chr(0xa5),
	576	'^H' => chr(0xc4).chr(0xa4),
	577	'^i' => chr(0xc3).chr(0xae),
	578	'^I' => chr(0xc3).chr(0x8e),
	579	'^j' => chr(0xc4).chr(0xb5),
	580	'^J' => chr(0xc4).chr(0xb4),
	581	'^o' => chr(0xc3).chr(0xb4),
	582	'^O' => chr(0xc3).chr(0x94),
	583	'^s' => chr(0xc5).chr(0x9d),
	584	'^S' => chr(0xc5).chr(0x9c),
	585	'^u' => chr(0xc3).chr(0xa2),
	586	'^U' => chr(0xc3).chr(0xbb),
	587	'^w' => chr(0xc5).chr(0xb5),
	588	'^W' => chr(0xc5).chr(0xb4),
	589	'^y' => chr(0xc5).chr(0xb7),
	590	'^Y' => chr(0xc5).chr(0xb6),
	591
	592	# diaeresis
	593	'"a' => chr(0xc3).chr(0xa4),
	594	'"A' => chr(0xc3).chr(0x84),
	595	'"e' => chr(0xc3).chr(0xab),
	596	'"E' => chr(0xc3).chr(0x8b),
	597	'"\\\\i' => chr(0xc3).chr(0xaf),
	598	'"\\\\I' => chr(0xc3).chr(0x8f),
	599	'"o' => chr(0xc3).chr(0xb6),
	600	'"O' => chr(0xc3).chr(0x96),
	601	'"u' => chr(0xc3).chr(0xbc),
	602	'"U' => chr(0xc3).chr(0x9c),
	603	'"y' => chr(0xc3).chr(0xbf),
	604	'"Y' => chr(0xc3).chr(0xb8),
	605	# tilde
[3142]	606	'~A' => chr(0xc3).chr(0x83),
	607	'~N' => chr(0xc3).chr(0x91),
	608	'~O' => chr(0xc3).chr(0x95),
	609	'~a' => chr(0xc3).chr(0xa3),
	610	'~n' => chr(0xc3).chr(0xb1),
	611	'~o' => chr(0xc3).chr(0xb5),
[2901]	612	# caron - handled specially
	613	# double acute
	614	# ring
	615	# dot
[3249]	616	'.c' => chr(0xc4).chr(0x8b),
	617	'.C' => chr(0xc4).chr(0x8a),
	618	'.e' => chr(0xc4).chr(0x97),
	619	'.E' => chr(0xc4).chr(0x96),
	620	'.g' => chr(0xc4).chr(0xa1),
	621	'.G' => chr(0xc4).chr(0xa0),
	622	'.I' => chr(0xc4).chr(0xb0),
	623	'.z' => chr(0xc5).chr(0xbc),
	624	'.Z' => chr(0xc5).chr(0xbb),
[2901]	625	# macron
	626	'=a' => chr(0xc4).chr(0x81),
	627	'=A' => chr(0xc4).chr(0x80),
	628	'=e' => chr(0xc4).chr(0x93),
	629	'=E' => chr(0xc4).chr(0x92),
	630	'=i' => chr(0xc4).chr(0xab),
	631	'=I' => chr(0xc4).chr(0xaa),
	632	'=o' => chr(0xc4).chr(0x8d),
	633	'=O' => chr(0xc4).chr(0x8c),
	634	'=u' => chr(0xc4).chr(0xab),
	635	'=U' => chr(0xc4).chr(0xaa),
	636
	637	# stroke - handled specially - see below
	638
	639	# cedilla - handled specially
	640	);
	641
	642	# these are one letter latex commands - we make sure they're not a longer
	643	# command name. eg {\d} is d+stroke, so careful of \d
	644	my %special_utf8_chars =
	645	(
[3156]	646	# breve
	647	'u g' => chr(0xc4).chr(0x9f),
	648	'u G' => chr(0xc4).chr(0x9e),
	649	'u i' => chr(0xc4).chr(0xad),
	650	'u I' => chr(0xc4).chr(0xac),
	651	'u o' => chr(0xc5).chr(0x8f),
	652	'u O' => chr(0xc5).chr(0x8e),
	653	'u u' => chr(0xc5).chr(0xad),
	654	'u U' => chr(0xc5).chr(0xac),
[2901]	655	# caron
[3156]	656	'v c' => chr(0xc4).chr(0x8d),
	657	'v C' => chr(0xc4).chr(0x8c),
[2901]	658	'v n' => chr(0xc5).chr(0x88),
	659	'v N' => chr(0xc5).chr(0x87),
	660	'v s' => chr(0xc5).chr(0xa1),
	661	'v S' => chr(0xc5).chr(0xa5),
[3156]	662	'v z' => chr(0xc5).chr(0xbe),
	663	'v Z' => chr(0xc5).chr(0xbd),
[2901]	664	# cedilla
	665	'c c' => chr(0xc3).chr(0xa7),
	666	'c C' => chr(0xc3).chr(0x87),
	667	'c g' => chr(0xc4).chr(0xa3),
	668	'c G' => chr(0xc4).chr(0xa2),
	669	'c k' => chr(0xc4).chr(0xb7),
	670	'c K' => chr(0xc4).chr(0xb6),
	671	'c l' => chr(0xc4).chr(0xbc),
	672	'c L' => chr(0xc4).chr(0xbb),
	673	'c n' => chr(0xc5).chr(0x86),
	674	'c N' => chr(0xc5).chr(0x85),
	675	'c r' => chr(0xc5).chr(0x97),
	676	'c R' => chr(0xc5).chr(0x96),
	677	'c s' => chr(0xc5).chr(0x9f),
	678	'c S' => chr(0xc5).chr(0x9e),
	679	'c t' => chr(0xc5).chr(0xa3),
	680	'c T' => chr(0xc5).chr(0xa2),
	681	# double acute / Hungarian accent
	682	'H O' => chr(0xc5).chr(0x90),
	683	'H o' => chr(0xc5).chr(0x91),
	684	'H U' => chr(0xc5).chr(0xb0),
	685	'H u' => chr(0xc5).chr(0xb1),
	686
	687	# stroke
	688	'd' => chr(0xc4).chr(0x91),
	689	'D' => chr(0xc4).chr(0x90),
	690	'h' => chr(0xc4).chr(0xa7),
	691	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
[3112]	692	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
[2901]	693	'l' => chr(0xc5).chr(0x82),
	694	'L' => chr(0xc5).chr(0x81),
	695	'o' => chr(0xc3).chr(0xb8),
	696	'O' => chr(0xc3).chr(0x98),
	697	't' => chr(0xc5).chr(0xa7),
	698	'T' => chr(0xc5).chr(0xa6),
	699	# german ss/szlig/sharp s
	700	'ss' => chr(0xc3).chr(0x9f),
	701	);
	702
	703	# convert latex-style accented characters.
[3249]	704
[2901]	705	# remove space (if any) between \ and letter to accent (eg {\' a})
	706
[13413]	707	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; #`
	708
[2901]	709	# remove {} around a single character (eg \'{e})
[13413]	710	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g; #`
[2901]	711
[3249]	712	# \, is another way of doing cedilla \c
	713	$text =~ s@\\,(.)@\\c $1@g;
	714
[2901]	715	# remove {} around a single character for special 1 letter commands -
	716	# need to insert a space. Eg \v{s} -> {\v s}
[3156]	717	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
[3249]	718
[2901]	719	# only do if the text contains a '\' character.
	720	if ($text =~ m\|\\\|) {
[3249]	721	# "normal" accents - ie non-alpha latex tag
[13413]	722	while ($text =~ m@\\([`'="^~\.])([\w])@) { #`
[3249]	723	my $tex="$1$2"; my $char="$2";
	724	my $replacement=$utf8_chars{$tex};
	725	if (!defined($replacement)) {
	726	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
	727	$replacement=$char;
	728	}
[5454]	729	$text =~ s/\\\Q$tex/$replacement/g;
[3249]	730	}
[2901]	731
[3249]	732	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
	733	# only do the change if immediately followed by a space, }, {, or \
	734	# one letter accents ( + ss)
	735	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
	736	my $tex=$1;
	737	my $replacement=$special_utf8_chars{$tex};
	738	if (!defined($replacement)) {
	739	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
	740	$replacement=$tex;
	741	}
	742	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
	743
	744	}
	745
	746	# one letter latex accent commands that affect following letter
	747	while ($text =~ m@\\([uvcH]) ([\w])@) {
	748	my $tex="$1 $2"; my $char="$2";
	749	my $replacement=$special_utf8_chars{$tex};
	750	if (!defined($replacement)) {
	751	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
	752	$replacement=$char;
	753	}
	754	$text =~ s/\\$tex/$replacement/g;
	755	}
[2901]	756	}
[3249]	757
	758	# escape html-sensitive characters
	759	$text =~ s@&@&@g;
	760	$text =~ s@<@<@g;
	761	$text =~ s@>@>@g;
	762	$text =~ s/''/"/g; # Latex-specific
	763	$text =~ s/``/"/g; # Latex-specific
	764	# greenstone-specific
	765	$text =~ s@\[@&\#91;@g;
	766	$text =~ s@\]@&\#93;@g;
	767
[3112]	768	# remove latex commands
[3249]	769
	770	# explicitly recognised commands
	771	$text =~ s@\\ldots@…@g;
	772
	773	# maths mode
	774	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
	775
	776	# remove all other commands with optional arguments...
[3156]	777	$text =~ s@\\\w+(\[.?\])?\s@@g;
	778	# $text =~ s@\\noopsort{[^}]+\}@@g;
	779	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
[3142]	780
[2901]	781	# remove latex groupings { } (but not \{ or \} )
[3249]	782	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
[3142]	783	$text =~ s/^\{//; # remove { if first char
	784
	785	# latex characters
	786	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
[3249]	787	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
[3142]	788	# optional break "\-"
[3249]	789	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
[3142]	790	# the non-macro bits have quotes around them - we just remove quotes
[3249]	791	$text =~ s/([^&])[\"\#]/$1/g;
[3142]	792	}
[3249]	793	# dashes. Convert (m\|n)-dash into single dash for html.
	794	$text =~ s@\-\-+@\-@g;
	795
[2901]	796	# quoted { } chars
	797	$text =~ s@\\{@{@g;
	798	$text =~ s@\\}@}@g;
	799
[3249]	800	# finally to protect against macro language...
	801	$text =~ s@\\@\\\\@g;
	802
[2901]	803	return $text;
	804	}
	805
	806
[3249]	807	sub process_latex_math {
	808	my $text = shift;
	809
	810	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
	811	$text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
	812	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
	813	$text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
	814	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
	815
	816	# put all other command names in italics
	817	$text =~ s@\\([\w]+)@<i>$1</i>@g;
	818
	819	return $text;
	820	}
	821
[2484]	822	sub set_OID {
	823	my $self = shift (@_);
	824	my ($doc_obj, $id, $segment_number) = @_;
	825
	826	if ( $self->{'key'} eq "default") {
[4792]	827	$doc_obj->set_OID("$id\_$segment_number");
[2484]	828	} else {
	829	$doc_obj->set_OID($self->{'key'});
	830	}
	831	}
[1676]	832
[2484]	833	1;
[4792]	834
	835

Note: See TracBrowser for help on using the repository browser.

Download in other formats: