Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 13413

Last change on this file since 13413 was 13413, checked in by kjdon, 17 years ago
added some special handling for year so that it only gets 4 digits. Also added some comments to keep emacs colouring working properly
Property svn:keywords set to `Author Date Id Revision`
File size: 23.5 KB

Rev	Line
[1676]	1	###########################################################################
	2	#
	3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright 2000 Gordon W. Paynter
[2901]	10	# Copyright 1999-2001 New Zealand Digital Library Project
[1676]	11	#
	12	# This program is free software; you can redistribute it and/or modify
	13	# it under the terms of the GNU General Public License as published by
	14	# the Free Software Foundation; either version 2 of the License, or
	15	# (at your option) any later version.
	16	#
	17	# This program is distributed in the hope that it will be useful,
	18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	# GNU General Public License for more details.
	21	#
	22	# You should have received a copy of the GNU General Public License
	23	# along with this program; if not, write to the Free Software
	24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	25	#
	26	###########################################################################
	27
	28
	29	# BibTexPlug reads bibliography files in BibTex format.
	30	#
	31	# by Gordon W. Paynter ([email protected]), November 2000
	32	# Based on ReferPlug. See ReferPlug for geneology.
	33	#
	34	# BibTexPlug creates a document object for every reference a the file.
	35	# It is a subclass of SplitPlug, so if there are multiple records, all
	36	# are read.
[2901]	37	#
	38	# Modified Dec 2001 by John McPherson:
	39	# * some modifications submitted by Sergey Yevtushenko
	40	# <[email protected]>
	41	# * some non-ascii char support (ie mostly Latin)
[1676]	42
[3587]	43
[1676]	44	package BibTexPlug;
	45
	46	use SplitPlug;
[10254]	47	use strict;
	48	no strict 'refs'; # allow filehandles to be variables and viceversa
[1676]	49
	50	# BibTexPlug is a sub-class of BasPlug.
	51	sub BEGIN {
[10254]	52	@BibTexPlug::ISA = ('SplitPlug');
[1676]	53	}
	54
[3540]	55	my $arguments =
[4744]	56	[ { 'name' => "process_exp",
[4873]	57	'desc' => "{BasPlug.process_exp}",
[6408]	58	'type' => "regexp",
[4873]	59	'reqd' => "no",
[6408]	60	'deft' => &get_default_process_exp() },
	61	{ 'name' => "split_exp",
	62	'desc' => "{SplitPlug.split_exp}",
	63	'type' => "regexp",
	64	'deft' => &get_default_split_exp(),
	65	'reqd' => "no" }
	66	];
[3540]	67
[4744]	68	my $options = { 'name' => "BibTexPlug",
[5680]	69	'desc' => "{BibTexPlug.desc}",
[6408]	70	'abstract' => "no",
	71	'inherits' => "yes",
[9120]	72	'explodes' => "yes",
[4744]	73	'args' => $arguments };
[3540]	74
[1676]	75	# This plugin processes files with the suffix ".bib"
	76	sub get_default_process_exp {
	77	return q^(?i)\.bib$^;
	78	}
	79
	80	# This plugin splits the input text at blank lines
	81	sub get_default_split_exp {
	82	return q^\n+(?=@)^;
	83	}
[3540]	84	sub new {
[10218]	85	my ($class) = shift (@_);
	86	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	87	push(@$pluginlist, $class);
	88
	89	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
	90	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
	91
[12169]	92	my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
[10218]	93
[3540]	94	return bless $self, $class;
	95	}
[1676]	96
[2901]	97
	98
[1676]	99	# The process function reads a single bibliographic record and stores
	100	# it as a new document.
	101
	102	sub process {
	103	my $self = shift (@_);
[6332]	104	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1676]	105	my $outhandle = $self->{'outhandle'};
	106
[10254]	107	my $cursection = $doc_obj->get_top_section();
[2484]	108	$self->{'key'} = "default";
	109
[1676]	110	# Check that we're dealing with a valid BibTex record
	111	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
	112
	113	# Ignore things we can't use
	114	return 0 if ($$textref =~ /^\@String/);
	115
	116	# Report that we're processing the file
[6332]	117	print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
[1676]	118	print $outhandle "BibTexPlug: processing $file\n"
	119	if ($self->{'verbosity'}) > 1;
	120
	121
[1711]	122	# This hash translates BibTex field names into metadata names. The
[1676]	123	# BibTex names are taken from the "Local Guide to Latex" Graeme
[2901]	124	# McKinstry. Metadata names are consistent with ReferPlug.
[1676]	125
[9582]	126	# The author metadata will be stored as one "Creator" entry, but will
	127	# also be split up into several individual "Author" fields.
	128
[1676]	129	my %field = (
	130	'address', 'PublisherAddress',
	131	'author', 'Creator',
[2901]	132
[1676]	133	'booktitle', 'Booktitle',
	134	'chapter', 'Chapter',
	135	'edition', 'Edition',
[2901]	136	'editor', 'Editor',
[1676]	137	'institution', 'Publisher',
	138	'journal', 'Journal',
	139	'month', 'Month',
	140	'number', 'Number',
	141	'pages', 'Pages',
	142	'publisher', 'Publisher',
	143	'school', 'Publisher',
	144	'title', 'Title',
	145	'volume', 'Volume',
[3142]	146	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
[1676]	147
	148	'keywords', 'Keywords',
	149	'abstract', 'Abstract',
[4792]	150	'copyright', 'Copyright',
	151	'note', 'Note',
[11527]	152	'url', 'URL',
[8121]	153	);
[1676]	154
	155	# Metadata fields
	156	my %metadata;
[4792]	157	my ($EntryType, $Creator, $Keywords, $text);
[1676]	158
[2901]	159	my $verbosity = $self->{'verbosity'};
	160	$verbosity = 0 unless $verbosity;
	161
[3142]	162	# Make sure the text has exactly one entry per line -
	163	# append line to previous if it doesn't start with " <key> = "
[2901]	164
[4792]	165	my @input_lines=split(/\r?\n/, $$textref);
[3142]	166	my @all_lines;
	167	my $entry_line=shift @input_lines;
	168	foreach my $input_line (@input_lines) {
	169	if ($input_line =~ m/^\s\w+\s=\s*/) {
	170	# this is a new key
	171	push(@all_lines, $entry_line);
	172	$entry_line=$input_line;
	173	} else {
	174	# this is a continuation of previous line
[3249]	175	$entry_line .= " " . $input_line;
[3142]	176	}
	177
	178	}
	179	# add final line, removing trailing '}'
	180	$entry_line =~ s/\}\s*$//;
	181	push(@all_lines, $entry_line);
	182	push(@all_lines, "}");
[2901]	183
[1676]	184	# Read and process each line in the bib file.
[2901]	185	my ($entryname, $name, $value, $line);
	186	foreach $line (@all_lines) {
	187
[1676]	188	# Add each line. Most lines consist of a field identifer and
	189	# then data, and we simply store them, though we treat some
	190	# of the fields a bit differently.
	191
	192	$line =~ s/\s+/ /g;
	193	$text .= "$line\n";
	194
[4792]	195
[2901]	196	print "Processing line = $line \n" if $verbosity>=4;
	197
[1676]	198	# The first line is special, it contains the reference type and OID
[4792]	199	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-])\W*$/) {
[1676]	200	$EntryType = $1;
[4792]	201	my $EntryID = (defined $2) ? $2 : "default";
[2484]	202	print "** $EntryType - \"$EntryID\" \n"
	203	if ($verbosity >= 4);
[4792]	204
[1676]	205	next;
	206	}
[2484]	207	if ($line =~ /\@/) {
[2901]	208	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
[2484]	209	if ($verbosity >= 2);
[2901]	210	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
[2484]	211	if ($verbosity >= 2);
	212	}
[1676]	213
	214	# otherwise, parse the metadata out of this line
[9582]	215	next unless ($line =~ /^\s(\w+)\s=\s(.)/);
[2901]	216	$entryname = lc($1);
[1676]	217	$value = $2;
[3112]	218	$value =~ s/,?\s*$//; # remove trailing comma and space
	219	if ($value =~ /^"/ && $value =~ /"$/) {
	220	# remove surrounding " marks
	221	$value =~ s/^"//; $value =~ s/"$//;
	222	}
[13413]	223
	224	# special case for year - we only want a 4 digit year, not
	225	# "circa 2004" etc
	226	if ($entryname=~ /^year$/i) {
	227	if ($value=~ /(\d{4})/) {
	228	$value=$1;
	229	$metadata{$entryname} .= "$value";
	230	}
	231	}
	232	else {
	233
	234	$value = &process_latex($value);
	235	# Add this line of metadata
	236	$metadata{$entryname} .= "$value";
	237	}
[1676]	238	}
	239
[1677]	240	# Add the Entry type as metadata
[2901]	241	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
[8121]	242
	243	#Add the fileformat as metadata
	244	$doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
[1676]	245
[8121]	246
[1676]	247	# Add the various field as metadata
[2901]	248	foreach my $entryname (keys %metadata) {
	249	next unless (defined $field{$entryname});
	250	next unless (defined $metadata{$entryname});
[1676]	251
[2901]	252	$name = $field{$entryname};
	253	$value = $metadata{$entryname};
[1676]	254
[3142]	255	if ($name =~ /^Month/) {
	256	$value=expand_month($value);
	257	}
[1676]	258
	259	# Several special operatons on metadata follow
	260
	261	# Add individual keywords.
	262	# The full set of keywords will be added, in due course, as "Keywords".
	263	# However, we also want to add them as individual "Keyword" metadata elements.
[2901]	264	if ($entryname eq "keywords") {
[1676]	265	my @keywordlist = split(/,/, $value);
	266	foreach my $k (@keywordlist) {
[2901]	267	$k = lc($k);
[1676]	268	$k =~ s/\s*$//;
	269	$k =~ s/^\s*//;
	270	if ($k =~ /\w/) {
[2901]	271	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
[1676]	272	}
	273	}
	274	}
	275
	276	# Add individual authors
	277	# The author metadata will be stored as one "Creator" entry, but we
	278	# also want to split it into several individual "Author" fields in
[3112]	279	# "Lastname, Firstnames" format so we can browse it.
[2901]	280	if ($entryname eq "author") { #added also comparison with editor
	281
[3249]	282	# take care of "et al."...
[9582]	283	my $etal='';
	284	if ($value =~ s/\s+(and\s+others\|et\.?\s+al\.?)\s*$//i) {
	285	$etal=' <em>et. al.</em>';
	286	}
[2901]	287	# und here for german language...
	288	# don't use brackets in pattern, else the matched bit becomes
	289	# an element in the list!
[3249]	290	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
	291	my @formattedlist = ();
[1676]	292	foreach $a (@authorlist) {
	293	$a =~ s/\s*$//;
	294	$a =~ s/^\s*//;
	295	# Reformat and add author name
[2901]	296	next if $a=~ /^\s*$/;
[3249]	297
	298	# names are "First von Last", "von Last, First"
	299	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
	300	my $first="";
	301	my $vonlast="";
	302	my $jr="";
[2901]	303
[3249]	304	if ($a =~ /,/) {
	305	my @parts=split(/,\s*/, $a);
	306	$first = pop @parts;
	307	if (scalar(@parts) == 2) {
	308	$jr = pop @parts;
	309	}
	310	$vonlast=shift @parts;
	311	if (scalar(@parts) > 0) {
	312	print $outhandle "BibTexPlug: couldn't parse name $a\n";
	313	# but we continue anyway...
	314	}
	315	} else { # First von Last
	316	my @words = split(/ /, $a);
	317	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
	318	$first .= " " . shift (@words);
	319	}
	320	$first =~ s/^\s//;
	321	$vonlast = join (' ', @words); # whatever's left...
	322	}
	323	my $von="";
	324	my $last="";
	325	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
	326	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
	327	$von = $1;
	328	if (!defined ($von)) {
	329	# some non-English names do start with lowercase
	330	# eg "Marie desJardins". Also we can get typos...
	331	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
	332	$von="";
	333	if ($vonlast =~ /^[a-z]+$/) {
	334	# if it's all lowercase, uppercase 1st.
	335	$vonlast =~ s/^(.)/\u$1/;
[2901]	336
[3249]	337	}
	338	}
	339	$von =~ s/\s*$//;
	340	$last=$vonlast;
	341	} else {
	342	$last=$vonlast;
	343	}
	344	my $wholename="$first $von $last $jr";
[9582]	345	$wholename =~ s/\s+/ /g; # squeeze multiple spaces
	346	$wholename =~ s/ $//;
[3249]	347	push (@formattedlist, $wholename);
	348	my $fullname = "$last";
	349	$fullname .= " $jr" if ($jr);
	350	$fullname .= ", $first";
	351	$fullname .= " $von" if ($von);
	352
[1676]	353	# Add each name to set of Authors
[3249]	354	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
[1676]	355	}
[3249]	356
	357	# Only want at most one "and" in the Creator field
	358	if (scalar(@formattedlist) > 2) {
	359	my $lastauthor=pop @formattedlist;
	360	$value=join(', ', @formattedlist);
	361	$value.=" and $lastauthor";
	362	} else { # 1 or 2 authors...
	363	$value=join(" and ",@formattedlist);
	364	}
	365	$value.=$etal; # if there was "et al."
[1676]	366	}
	367
	368	# Books and Journals are additionally marked for display purposes
[2901]	369	if ($entryname eq "booktitle") {
	370	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
	371	} elsif ($entryname eq "journal") {
	372	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
[1676]	373	}
	374
[3249]	375	# Add the various fields as metadata
	376	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
[13413]	377
[1676]	378	}
	379
[9582]	380	# for books and journals...
	381	if (!exists $metadata{'title'}) {
	382	my $name=$field{'title'}; # get Greenstone metadata name
	383	my $value;
	384	if (exists $metadata{'booktitle'}) {
	385	$value=$metadata{'booktitle'};
	386	} elsif (exists $metadata{'journal'}) {
	387	$value=$metadata{'journal'};
	388	}
	389	if ($value) {
	390	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
	391	}
	392	}
	393
[3249]	394	# Add Date (yyyymmdd) metadata
	395	if (defined ($metadata{'year'}) ) {
	396	my $date=$metadata{'year'};
	397	chomp $date;
	398	my $month=$metadata{'month'};
	399	if (defined($month)) {
	400	# month is currently 3 letter code or a range...
	401	$month = expand_month($month);
	402	# take the first month found... might not find one!
	403	$month =~ m/_textmonth(\d\d)_/;
	404	$month = $1;
	405	}
	406	if (!defined($month)) {
	407	$month="00";
	408	}
	409	$date .= "${month}00";
	410	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
	411	}
	412
	413	# # Add the text in BibTex format (all fields)
[1676]	414	if ($text =~ /\w/) {
[3249]	415
	416	$text =~ s@&@&@g;
	417	$text =~ s@<@<@g;
	418	$text =~ s@>@>@g;
	419	$text =~ s@\n@<br/>\n@g;
	420	$text =~ s@\\@\\\\@g;
	421
[2901]	422	$doc_obj->add_utf8_text ($cursection, $text);
[1676]	423	}
	424
[1677]	425	return 1;
[1676]	426	}
	427
[1677]	428
[2901]	429
	430
	431	# convert email addresses and URLs into links
	432	sub convert_urls_into_links{
	433	my ($text) = @_;
	434
	435	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[3540]	436	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
[2901]	437
	438	return $text;
	439	}
	440
	441	# Clean up whitespace and convert \n charaters to <BR> or <P>
	442	sub clean_up_whitespaces{
	443	my ($text) = @_;
	444
	445	$text =~ s/%%%%%/<BR> <BR>/g;
	446	$text =~ s/ +/ /g;
	447	$text =~ s/\s*$//;
	448	$text =~ s/^\s*//;
	449	$text =~ s/\n/\n<BR>/g;
	450	$text =~ s/<BR>\s*<BR>/<P>/g;
	451
	452	return $text;
	453	}
	454
	455
	456	sub convert_problem_characters_without_ampersand{
	457	my ($text) = @_;
	458	$text =~ s/</</g;
	459	$text =~ s/>/>/g;
	460
	461	$text =~ s/\'\'/\"/g; #Latex -specific conversion
	462	$text =~ s/\`\`/\"/g; #Latex -specific conversion
	463
	464	$text =~ s/\"/"/g;
	465	$text =~ s/\'/’/g;
	466	$text =~ s/\`/‘/g;
	467
[3249]	468	# $text =~ s/\+/ /g;
	469	# $text =~ s/\(/ /g;
	470	# $text =~ s/\)/ /g;
	471
[2901]	472	$text =~ s/\\/\\\\/g;
	473
[3249]	474	# $text =~ s/\./\\\./g;
[2901]	475
	476	return $text;
	477	}
	478
[1676]	479	# Convert a text string into HTML.
	480
[1677]	481	# The HTML is going to be inserted into a GML file, so we have to be
[2901]	482	# careful not to use symbols like ">", which occurs frequently in email
[1677]	483	# messages (and use &gt instead.
	484
	485	# This function also turns URLs and email addresses into links, and
	486	# replaces carriage returns with <BR> tags (and multiple carriage returns
	487	# with <P> tags).
	488
[1676]	489	sub text_into_html {
	490	my ($text) = @_;
	491
[2901]	492	# Convert problem characters into HTML symbols
[1676]	493	$text =~ s/&/&/g;
	494
[2901]	495	$text = &convert_problem_characters_without_ampersand( $text );
	496
[1676]	497	# convert email addresses and URLs into links
[2901]	498	$text = &convert_urls_into_links( $text );
[1676]	499
[2901]	500	$text = &clean_up_whitespaces( $text );
[1676]	501
	502	return $text;
	503	}
	504
[2901]	505
[3142]	506	sub expand_month {
	507	my $text=shift;
[2901]	508
[3142]	509	# bibtex style files expand abbreviations for months.
	510	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
	511	$text =~ s/jan/_textmonth01_/g;
	512	$text =~ s/feb/_textmonth02_/g;
	513	$text =~ s/mar/_textmonth03_/g;
	514	$text =~ s/apr/_textmonth04_/g;
	515	$text =~ s/may/_textmonth05_/g;
	516	$text =~ s/jun/_textmonth06_/g;
	517	$text =~ s/jul/_textmonth07_/g;
	518	$text =~ s/aug/_textmonth08_/g;
	519	$text =~ s/sep/_textmonth09_/g;
	520	$text =~ s/oct/_textmonth10_/g;
	521	$text =~ s/nov/_textmonth11_/g;
	522	$text =~ s/dec/_textmonth12_/g;
[2901]	523
[3142]	524	return $text;
	525	}
	526
	527
[2901]	528	# Convert accented characters, remove { }, interprete some commands....
	529	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
	530	sub process_latex {
	531	my ($text) = @_;
	532
	533	# note - this is really ugly, but it works. There may be a prettier way
	534	# of mapping latex accented chars to utf8, but we just brute force it here.
	535	# Also, this isn't complete - not every single possible accented letter
	536	# is in here yet, but most of the common ones are.
	537
	538	my %utf8_chars =
	539	(
	540	# acutes
	541	'\'a' => chr(0xc3).chr(0xa1),
	542	'\'c' => chr(0xc4).chr(0x87),
	543	'\'e' => chr(0xc3).chr(0xa9),
	544	'\'i' => chr(0xc3).chr(0xad),
	545	'\'l' => chr(0xc3).chr(0xba),
	546	'\'n' => chr(0xc3).chr(0x84),
	547	'\'o' => chr(0xc3).chr(0xb3),
	548	'\'r' => chr(0xc5).chr(0x95),
	549	'\'s' => chr(0xc5).chr(0x9b),
	550	'\'u' => chr(0xc3).chr(0xba),
	551	'\'y' => chr(0xc3).chr(0xbd),
	552	'\'z' => chr(0xc5).chr(0xba),
	553	# graves
	554	'`a' => chr(0xc3).chr(0xa0),
	555	'`A' => chr(0xc3).chr(0x80),
	556	'`e' => chr(0xc3).chr(0xa8),
	557	'`E' => chr(0xc3).chr(0x88),
	558	'`i' => chr(0xc3).chr(0xac),
	559	'`I' => chr(0xc3).chr(0x8c),
	560	'`o' => chr(0xc3).chr(0xb2),
	561	'`O' => chr(0xc3).chr(0x92),
	562	'`u' => chr(0xc3).chr(0xb9),
	563	'`U' => chr(0xc3).chr(0x99),
	564	# circumflex
	565	'^a' => chr(0xc3).chr(0xa2),
	566	'^A' => chr(0xc3).chr(0x82),
	567	'^c' => chr(0xc4).chr(0x89),
	568	'^C' => chr(0xc4).chr(0x88),
	569	'^e' => chr(0xc3).chr(0xaa),
	570	'^E' => chr(0xc3).chr(0x8a),
	571	'^g' => chr(0xc4).chr(0x9d),
	572	'^G' => chr(0xc4).chr(0x9c),
	573	'^h' => chr(0xc4).chr(0xa5),
	574	'^H' => chr(0xc4).chr(0xa4),
	575	'^i' => chr(0xc3).chr(0xae),
	576	'^I' => chr(0xc3).chr(0x8e),
	577	'^j' => chr(0xc4).chr(0xb5),
	578	'^J' => chr(0xc4).chr(0xb4),
	579	'^o' => chr(0xc3).chr(0xb4),
	580	'^O' => chr(0xc3).chr(0x94),
	581	'^s' => chr(0xc5).chr(0x9d),
	582	'^S' => chr(0xc5).chr(0x9c),
	583	'^u' => chr(0xc3).chr(0xa2),
	584	'^U' => chr(0xc3).chr(0xbb),
	585	'^w' => chr(0xc5).chr(0xb5),
	586	'^W' => chr(0xc5).chr(0xb4),
	587	'^y' => chr(0xc5).chr(0xb7),
	588	'^Y' => chr(0xc5).chr(0xb6),
	589
	590	# diaeresis
	591	'"a' => chr(0xc3).chr(0xa4),
	592	'"A' => chr(0xc3).chr(0x84),
	593	'"e' => chr(0xc3).chr(0xab),
	594	'"E' => chr(0xc3).chr(0x8b),
	595	'"\\\\i' => chr(0xc3).chr(0xaf),
	596	'"\\\\I' => chr(0xc3).chr(0x8f),
	597	'"o' => chr(0xc3).chr(0xb6),
	598	'"O' => chr(0xc3).chr(0x96),
	599	'"u' => chr(0xc3).chr(0xbc),
	600	'"U' => chr(0xc3).chr(0x9c),
	601	'"y' => chr(0xc3).chr(0xbf),
	602	'"Y' => chr(0xc3).chr(0xb8),
	603	# tilde
[3142]	604	'~A' => chr(0xc3).chr(0x83),
	605	'~N' => chr(0xc3).chr(0x91),
	606	'~O' => chr(0xc3).chr(0x95),
	607	'~a' => chr(0xc3).chr(0xa3),
	608	'~n' => chr(0xc3).chr(0xb1),
	609	'~o' => chr(0xc3).chr(0xb5),
[2901]	610	# caron - handled specially
	611	# double acute
	612	# ring
	613	# dot
[3249]	614	'.c' => chr(0xc4).chr(0x8b),
	615	'.C' => chr(0xc4).chr(0x8a),
	616	'.e' => chr(0xc4).chr(0x97),
	617	'.E' => chr(0xc4).chr(0x96),
	618	'.g' => chr(0xc4).chr(0xa1),
	619	'.G' => chr(0xc4).chr(0xa0),
	620	'.I' => chr(0xc4).chr(0xb0),
	621	'.z' => chr(0xc5).chr(0xbc),
	622	'.Z' => chr(0xc5).chr(0xbb),
[2901]	623	# macron
	624	'=a' => chr(0xc4).chr(0x81),
	625	'=A' => chr(0xc4).chr(0x80),
	626	'=e' => chr(0xc4).chr(0x93),
	627	'=E' => chr(0xc4).chr(0x92),
	628	'=i' => chr(0xc4).chr(0xab),
	629	'=I' => chr(0xc4).chr(0xaa),
	630	'=o' => chr(0xc4).chr(0x8d),
	631	'=O' => chr(0xc4).chr(0x8c),
	632	'=u' => chr(0xc4).chr(0xab),
	633	'=U' => chr(0xc4).chr(0xaa),
	634
	635	# stroke - handled specially - see below
	636
	637	# cedilla - handled specially
	638	);
	639
	640	# these are one letter latex commands - we make sure they're not a longer
	641	# command name. eg {\d} is d+stroke, so careful of \d
	642	my %special_utf8_chars =
	643	(
[3156]	644	# breve
	645	'u g' => chr(0xc4).chr(0x9f),
	646	'u G' => chr(0xc4).chr(0x9e),
	647	'u i' => chr(0xc4).chr(0xad),
	648	'u I' => chr(0xc4).chr(0xac),
	649	'u o' => chr(0xc5).chr(0x8f),
	650	'u O' => chr(0xc5).chr(0x8e),
	651	'u u' => chr(0xc5).chr(0xad),
	652	'u U' => chr(0xc5).chr(0xac),
[2901]	653	# caron
[3156]	654	'v c' => chr(0xc4).chr(0x8d),
	655	'v C' => chr(0xc4).chr(0x8c),
[2901]	656	'v n' => chr(0xc5).chr(0x88),
	657	'v N' => chr(0xc5).chr(0x87),
	658	'v s' => chr(0xc5).chr(0xa1),
	659	'v S' => chr(0xc5).chr(0xa5),
[3156]	660	'v z' => chr(0xc5).chr(0xbe),
	661	'v Z' => chr(0xc5).chr(0xbd),
[2901]	662	# cedilla
	663	'c c' => chr(0xc3).chr(0xa7),
	664	'c C' => chr(0xc3).chr(0x87),
	665	'c g' => chr(0xc4).chr(0xa3),
	666	'c G' => chr(0xc4).chr(0xa2),
	667	'c k' => chr(0xc4).chr(0xb7),
	668	'c K' => chr(0xc4).chr(0xb6),
	669	'c l' => chr(0xc4).chr(0xbc),
	670	'c L' => chr(0xc4).chr(0xbb),
	671	'c n' => chr(0xc5).chr(0x86),
	672	'c N' => chr(0xc5).chr(0x85),
	673	'c r' => chr(0xc5).chr(0x97),
	674	'c R' => chr(0xc5).chr(0x96),
	675	'c s' => chr(0xc5).chr(0x9f),
	676	'c S' => chr(0xc5).chr(0x9e),
	677	'c t' => chr(0xc5).chr(0xa3),
	678	'c T' => chr(0xc5).chr(0xa2),
	679	# double acute / Hungarian accent
	680	'H O' => chr(0xc5).chr(0x90),
	681	'H o' => chr(0xc5).chr(0x91),
	682	'H U' => chr(0xc5).chr(0xb0),
	683	'H u' => chr(0xc5).chr(0xb1),
	684
	685	# stroke
	686	'd' => chr(0xc4).chr(0x91),
	687	'D' => chr(0xc4).chr(0x90),
	688	'h' => chr(0xc4).chr(0xa7),
	689	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
[3112]	690	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
[2901]	691	'l' => chr(0xc5).chr(0x82),
	692	'L' => chr(0xc5).chr(0x81),
	693	'o' => chr(0xc3).chr(0xb8),
	694	'O' => chr(0xc3).chr(0x98),
	695	't' => chr(0xc5).chr(0xa7),
	696	'T' => chr(0xc5).chr(0xa6),
	697	# german ss/szlig/sharp s
	698	'ss' => chr(0xc3).chr(0x9f),
	699	);
	700
	701	# convert latex-style accented characters.
[3249]	702
[2901]	703	# remove space (if any) between \ and letter to accent (eg {\' a})
	704
[13413]	705	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; #`
	706
[2901]	707	# remove {} around a single character (eg \'{e})
[13413]	708	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g; #`
[2901]	709
[3249]	710	# \, is another way of doing cedilla \c
	711	$text =~ s@\\,(.)@\\c $1@g;
	712
[2901]	713	# remove {} around a single character for special 1 letter commands -
	714	# need to insert a space. Eg \v{s} -> {\v s}
[3156]	715	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
[3249]	716
[2901]	717	# only do if the text contains a '\' character.
	718	if ($text =~ m\|\\\|) {
[3249]	719	# "normal" accents - ie non-alpha latex tag
[13413]	720	while ($text =~ m@\\([`'="^~\.])([\w])@) { #`
[3249]	721	my $tex="$1$2"; my $char="$2";
	722	my $replacement=$utf8_chars{$tex};
	723	if (!defined($replacement)) {
	724	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
	725	$replacement=$char;
	726	}
[5454]	727	$text =~ s/\\\Q$tex/$replacement/g;
[3249]	728	}
[2901]	729
[3249]	730	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
	731	# only do the change if immediately followed by a space, }, {, or \
	732	# one letter accents ( + ss)
	733	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
	734	my $tex=$1;
	735	my $replacement=$special_utf8_chars{$tex};
	736	if (!defined($replacement)) {
	737	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
	738	$replacement=$tex;
	739	}
	740	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
	741
	742	}
	743
	744	# one letter latex accent commands that affect following letter
	745	while ($text =~ m@\\([uvcH]) ([\w])@) {
	746	my $tex="$1 $2"; my $char="$2";
	747	my $replacement=$special_utf8_chars{$tex};
	748	if (!defined($replacement)) {
	749	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
	750	$replacement=$char;
	751	}
	752	$text =~ s/\\$tex/$replacement/g;
	753	}
[2901]	754	}
[3249]	755
	756	# escape html-sensitive characters
	757	$text =~ s@&@&@g;
	758	$text =~ s@<@<@g;
	759	$text =~ s@>@>@g;
	760	$text =~ s/''/"/g; # Latex-specific
	761	$text =~ s/``/"/g; # Latex-specific
	762	# greenstone-specific
	763	$text =~ s@\[@&\#91;@g;
	764	$text =~ s@\]@&\#93;@g;
	765
[3112]	766	# remove latex commands
[3249]	767
	768	# explicitly recognised commands
	769	$text =~ s@\\ldots@…@g;
	770
	771	# maths mode
	772	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
	773
	774	# remove all other commands with optional arguments...
[3156]	775	$text =~ s@\\\w+(\[.?\])?\s@@g;
	776	# $text =~ s@\\noopsort{[^}]+\}@@g;
	777	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
[3142]	778
[2901]	779	# remove latex groupings { } (but not \{ or \} )
[3249]	780	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
[3142]	781	$text =~ s/^\{//; # remove { if first char
	782
	783	# latex characters
	784	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
[3249]	785	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
[3142]	786	# optional break "\-"
[3249]	787	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
[3142]	788	# the non-macro bits have quotes around them - we just remove quotes
[3249]	789	$text =~ s/([^&])[\"\#]/$1/g;
[3142]	790	}
[3249]	791	# dashes. Convert (m\|n)-dash into single dash for html.
	792	$text =~ s@\-\-+@\-@g;
	793
[2901]	794	# quoted { } chars
	795	$text =~ s@\\{@{@g;
	796	$text =~ s@\\}@}@g;
	797
[3249]	798	# finally to protect against macro language...
	799	$text =~ s@\\@\\\\@g;
	800
[2901]	801	return $text;
	802	}
	803
	804
[3249]	805	sub process_latex_math {
	806	my $text = shift;
	807
	808	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
	809	$text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
	810	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
	811	$text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
	812	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
	813
	814	# put all other command names in italics
	815	$text =~ s@\\([\w]+)@<i>$1</i>@g;
	816
	817	return $text;
	818	}
	819
[2484]	820	sub set_OID {
	821	my $self = shift (@_);
	822	my ($doc_obj, $id, $segment_number) = @_;
	823
	824	if ( $self->{'key'} eq "default") {
[4792]	825	$doc_obj->set_OID("$id\_$segment_number");
[2484]	826	} else {
	827	$doc_obj->set_OID($self->{'key'});
	828	}
	829	}
[1676]	830
[2484]	831	1;
[4792]	832
	833

Note: See TracBrowser for help on using the repository browser.

Download in other formats: