Context Navigation

source: main/trunk/greenstone2/perllib/plugins/BibTexPlugin.pm@ 32129

Last change on this file since 32129 was 32129, checked in by kjdon, 6 years ago
After () in a regex, {} signifys quantifiers. eg (xx){2,4} - 2-4 occurrences. In later perl versions, it is illegal to have an unescaped { after a ) in a regex. If you actually want to match { you need to escape it. So I have escaped all { following ) in regex
Property svn:keywords set to `Author Date Id Revision`
File size: 23.5 KB

Rev	Line
[1676]	1	###########################################################################
	2	#
[15872]	3	# BibTexPlugin.pm - a plugin for bibliography records in BibTex format
[1676]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright 2000 Gordon W. Paynter
[2901]	10	# Copyright 1999-2001 New Zealand Digital Library Project
[1676]	11	#
	12	# This program is free software; you can redistribute it and/or modify
	13	# it under the terms of the GNU General Public License as published by
	14	# the Free Software Foundation; either version 2 of the License, or
	15	# (at your option) any later version.
	16	#
	17	# This program is distributed in the hope that it will be useful,
	18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	# GNU General Public License for more details.
	21	#
	22	# You should have received a copy of the GNU General Public License
	23	# along with this program; if not, write to the Free Software
	24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	25	#
	26	###########################################################################
	27
	28
[15872]	29	# BibTexPlugin reads bibliography files in BibTex format.
[1676]	30	#
	31	# by Gordon W. Paynter ([email protected]), November 2000
	32	# Based on ReferPlug. See ReferPlug for geneology.
	33	#
[15872]	34	# BibTexPlugin creates a document object for every reference a the file.
	35	# It is a subclass of SplitTextFile, so if there are multiple records, all
[1676]	36	# are read.
[2901]	37	#
	38	# Modified Dec 2001 by John McPherson:
	39	# * some modifications submitted by Sergey Yevtushenko
	40	# <[email protected]>
	41	# * some non-ascii char support (ie mostly Latin)
[1676]	42
[3587]	43
[15872]	44	package BibTexPlugin;
[1676]	45
[15872]	46	use SplitTextFile;
[24548]	47	use MetadataRead;
[10254]	48	use strict;
	49	no strict 'refs'; # allow filehandles to be variables and viceversa
[1676]	50
[15872]	51	# BibTexPlugin is a sub-class of SplitTextFile.
[1676]	52	sub BEGIN {
[24548]	53	@BibTexPlugin::ISA = ('MetadataRead', 'SplitTextFile');
[1676]	54	}
	55
[3540]	56	my $arguments =
[4744]	57	[ { 'name' => "process_exp",
[31492]	58	'desc' => "{BaseImporter.process_exp}",
[6408]	59	'type' => "regexp",
[4873]	60	'reqd' => "no",
[6408]	61	'deft' => &get_default_process_exp() },
	62	{ 'name' => "split_exp",
[15872]	63	'desc' => "{SplitTextFile.split_exp}",
[6408]	64	'type' => "regexp",
	65	'deft' => &get_default_split_exp(),
	66	'reqd' => "no" }
	67	];
[3540]	68
[15872]	69	my $options = { 'name' => "BibTexPlugin",
	70	'desc' => "{BibTexPlugin.desc}",
[6408]	71	'abstract' => "no",
	72	'inherits' => "yes",
[9120]	73	'explodes' => "yes",
[4744]	74	'args' => $arguments };
[3540]	75
[1676]	76	# This plugin processes files with the suffix ".bib"
	77	sub get_default_process_exp {
	78	return q^(?i)\.bib$^;
	79	}
	80
	81	# This plugin splits the input text at blank lines
	82	sub get_default_split_exp {
	83	return q^\n+(?=@)^;
	84	}
[15872]	85
[3540]	86	sub new {
[10218]	87	my ($class) = shift (@_);
	88	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	89	push(@$pluginlist, $class);
	90
[15872]	91	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	92	push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]	93
[15872]	94	my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
[10218]	95
[3540]	96	return bless $self, $class;
	97	}
[1676]	98
[2901]	99
	100
[1676]	101	# The process function reads a single bibliographic record and stores
	102	# it as a new document.
	103
	104	sub process {
	105	my $self = shift (@_);
[6332]	106	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1676]	107	my $outhandle = $self->{'outhandle'};
	108
[10254]	109	my $cursection = $doc_obj->get_top_section();
[2484]	110	$self->{'key'} = "default";
	111
[1676]	112	# Check that we're dealing with a valid BibTex record
	113	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
	114
	115	# Ignore things we can't use
	116	return 0 if ($$textref =~ /^\@String/);
	117
[1711]	118	# This hash translates BibTex field names into metadata names. The
[1676]	119	# BibTex names are taken from the "Local Guide to Latex" Graeme
[2901]	120	# McKinstry. Metadata names are consistent with ReferPlug.
[1676]	121
[9582]	122	# The author metadata will be stored as one "Creator" entry, but will
	123	# also be split up into several individual "Author" fields.
	124
[1676]	125	my %field = (
	126	'address', 'PublisherAddress',
	127	'author', 'Creator',
[2901]	128
[1676]	129	'booktitle', 'Booktitle',
	130	'chapter', 'Chapter',
	131	'edition', 'Edition',
[2901]	132	'editor', 'Editor',
[1676]	133	'institution', 'Publisher',
	134	'journal', 'Journal',
	135	'month', 'Month',
	136	'number', 'Number',
	137	'pages', 'Pages',
	138	'publisher', 'Publisher',
	139	'school', 'Publisher',
	140	'title', 'Title',
	141	'volume', 'Volume',
[3142]	142	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
[1676]	143
	144	'keywords', 'Keywords',
	145	'abstract', 'Abstract',
[4792]	146	'copyright', 'Copyright',
	147	'note', 'Note',
[11527]	148	'url', 'URL',
[8121]	149	);
[1676]	150
	151	# Metadata fields
	152	my %metadata;
[4792]	153	my ($EntryType, $Creator, $Keywords, $text);
[1676]	154
[2901]	155	my $verbosity = $self->{'verbosity'};
	156	$verbosity = 0 unless $verbosity;
	157
[3142]	158	# Make sure the text has exactly one entry per line -
	159	# append line to previous if it doesn't start with " <key> = "
[2901]	160
[4792]	161	my @input_lines=split(/\r?\n/, $$textref);
[3142]	162	my @all_lines;
	163	my $entry_line=shift @input_lines;
	164	foreach my $input_line (@input_lines) {
	165	if ($input_line =~ m/^\s\w+\s=\s*/) {
	166	# this is a new key
	167	push(@all_lines, $entry_line);
	168	$entry_line=$input_line;
	169	} else {
	170	# this is a continuation of previous line
[3249]	171	$entry_line .= " " . $input_line;
[3142]	172	}
	173
	174	}
	175	# add final line, removing trailing '}'
	176	$entry_line =~ s/\}\s*$//;
	177	push(@all_lines, $entry_line);
	178	push(@all_lines, "}");
[2901]	179
[1676]	180	# Read and process each line in the bib file.
[2901]	181	my ($entryname, $name, $value, $line);
	182	foreach $line (@all_lines) {
	183
[1676]	184	# Add each line. Most lines consist of a field identifer and
	185	# then data, and we simply store them, though we treat some
	186	# of the fields a bit differently.
	187
	188	$line =~ s/\s+/ /g;
	189	$text .= "$line\n";
	190
[4792]	191
[2901]	192	print "Processing line = $line \n" if $verbosity>=4;
	193
[1676]	194	# The first line is special, it contains the reference type and OID
[4792]	195	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-])\W*$/) {
[1676]	196	$EntryType = $1;
[4792]	197	my $EntryID = (defined $2) ? $2 : "default";
[2484]	198	print "** $EntryType - \"$EntryID\" \n"
	199	if ($verbosity >= 4);
[4792]	200
[1676]	201	next;
	202	}
[2484]	203	if ($line =~ /\@/) {
[2901]	204	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
[2484]	205	if ($verbosity >= 2);
[2901]	206	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
[2484]	207	if ($verbosity >= 2);
	208	}
[1676]	209
	210	# otherwise, parse the metadata out of this line
[9582]	211	next unless ($line =~ /^\s(\w+)\s=\s(.)/);
[2901]	212	$entryname = lc($1);
[1676]	213	$value = $2;
[3112]	214	$value =~ s/,?\s*$//; # remove trailing comma and space
	215	if ($value =~ /^"/ && $value =~ /"$/) {
	216	# remove surrounding " marks
	217	$value =~ s/^"//; $value =~ s/"$//;
[13416]	218	} elsif ($value =~ /^\{/ && $value =~ /\}$/) {
	219	# remove surrounding {} marks
	220	$value =~ s/^\{//; $value =~ s/\}$//;
[3112]	221	}
[13413]	222	# special case for year - we only want a 4 digit year, not
	223	# "circa 2004" etc
	224	if ($entryname=~ /^year$/i) {
	225	if ($value=~ /(\d{4})/) {
	226	$value=$1;
	227	$metadata{$entryname} .= "$value";
	228	}
	229	}
	230	else {
	231
	232	$value = &process_latex($value);
	233	# Add this line of metadata
	234	$metadata{$entryname} .= "$value";
	235	}
[1676]	236	}
	237
[1677]	238	# Add the Entry type as metadata
[2901]	239	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
[8121]	240
	241	#Add the fileformat as metadata
	242	$doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
[1676]	243
[8121]	244
[1676]	245	# Add the various field as metadata
[2901]	246	foreach my $entryname (keys %metadata) {
	247	next unless (defined $field{$entryname});
	248	next unless (defined $metadata{$entryname});
[1676]	249
[2901]	250	$name = $field{$entryname};
	251	$value = $metadata{$entryname};
[1676]	252
[3142]	253	if ($name =~ /^Month/) {
	254	$value=expand_month($value);
	255	}
[1676]	256
	257	# Several special operatons on metadata follow
	258
	259	# Add individual keywords.
	260	# The full set of keywords will be added, in due course, as "Keywords".
	261	# However, we also want to add them as individual "Keyword" metadata elements.
[2901]	262	if ($entryname eq "keywords") {
[1676]	263	my @keywordlist = split(/,/, $value);
	264	foreach my $k (@keywordlist) {
[2901]	265	$k = lc($k);
[1676]	266	$k =~ s/\s*$//;
	267	$k =~ s/^\s*//;
	268	if ($k =~ /\w/) {
[2901]	269	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
[1676]	270	}
	271	}
	272	}
	273
	274	# Add individual authors
	275	# The author metadata will be stored as one "Creator" entry, but we
	276	# also want to split it into several individual "Author" fields in
[3112]	277	# "Lastname, Firstnames" format so we can browse it.
[2901]	278	if ($entryname eq "author") { #added also comparison with editor
	279
[3249]	280	# take care of "et al."...
[9582]	281	my $etal='';
	282	if ($value =~ s/\s+(and\s+others\|et\.?\s+al\.?)\s*$//i) {
	283	$etal=' <em>et. al.</em>';
	284	}
[2901]	285	# und here for german language...
	286	# don't use brackets in pattern, else the matched bit becomes
	287	# an element in the list!
[3249]	288	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
	289	my @formattedlist = ();
[1676]	290	foreach $a (@authorlist) {
	291	$a =~ s/\s*$//;
	292	$a =~ s/^\s*//;
	293	# Reformat and add author name
[2901]	294	next if $a=~ /^\s*$/;
[3249]	295
	296	# names are "First von Last", "von Last, First"
	297	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
	298	my $first="";
	299	my $vonlast="";
	300	my $jr="";
[2901]	301
[3249]	302	if ($a =~ /,/) {
	303	my @parts=split(/,\s*/, $a);
	304	$first = pop @parts;
	305	if (scalar(@parts) == 2) {
	306	$jr = pop @parts;
	307	}
	308	$vonlast=shift @parts;
	309	if (scalar(@parts) > 0) {
[15872]	310	print $outhandle "BibTexPlugin: couldn't parse name $a\n";
[3249]	311	# but we continue anyway...
	312	}
	313	} else { # First von Last
	314	my @words = split(/ /, $a);
[32122]	315	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2,}/) {
[3249]	316	$first .= " " . shift (@words);
	317	}
	318	$first =~ s/^\s//;
	319	$vonlast = join (' ', @words); # whatever's left...
	320	}
	321	my $von="";
	322	my $last="";
	323	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
	324	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
	325	$von = $1;
	326	if (!defined ($von)) {
	327	# some non-English names do start with lowercase
	328	# eg "Marie desJardins". Also we can get typos...
[15872]	329	print $outhandle "BibTexPlugin: couldn't parse surname $vonlast\n";
[3249]	330	$von="";
	331	if ($vonlast =~ /^[a-z]+$/) {
	332	# if it's all lowercase, uppercase 1st.
	333	$vonlast =~ s/^(.)/\u$1/;
[2901]	334
[3249]	335	}
	336	}
	337	$von =~ s/\s*$//;
	338	$last=$vonlast;
	339	} else {
	340	$last=$vonlast;
	341	}
	342	my $wholename="$first $von $last $jr";
[9582]	343	$wholename =~ s/\s+/ /g; # squeeze multiple spaces
	344	$wholename =~ s/ $//;
[3249]	345	push (@formattedlist, $wholename);
	346	my $fullname = "$last";
	347	$fullname .= " $jr" if ($jr);
	348	$fullname .= ", $first";
	349	$fullname .= " $von" if ($von);
	350
[1676]	351	# Add each name to set of Authors
[3249]	352	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
[1676]	353	}
[3249]	354
	355	# Only want at most one "and" in the Creator field
	356	if (scalar(@formattedlist) > 2) {
	357	my $lastauthor=pop @formattedlist;
	358	$value=join(', ', @formattedlist);
	359	$value.=" and $lastauthor";
	360	} else { # 1 or 2 authors...
	361	$value=join(" and ",@formattedlist);
	362	}
	363	$value.=$etal; # if there was "et al."
[1676]	364	}
	365
	366	# Books and Journals are additionally marked for display purposes
[2901]	367	if ($entryname eq "booktitle") {
	368	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
	369	} elsif ($entryname eq "journal") {
	370	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
[1676]	371	}
	372
[3249]	373	# Add the various fields as metadata
	374	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
[13413]	375
[1676]	376	}
	377
[9582]	378	# for books and journals...
	379	if (!exists $metadata{'title'}) {
	380	my $name=$field{'title'}; # get Greenstone metadata name
	381	my $value;
	382	if (exists $metadata{'booktitle'}) {
	383	$value=$metadata{'booktitle'};
	384	} elsif (exists $metadata{'journal'}) {
	385	$value=$metadata{'journal'};
	386	}
	387	if ($value) {
	388	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
	389	}
	390	}
	391
[3249]	392	# Add Date (yyyymmdd) metadata
	393	if (defined ($metadata{'year'}) ) {
	394	my $date=$metadata{'year'};
	395	chomp $date;
	396	my $month=$metadata{'month'};
	397	if (defined($month)) {
	398	# month is currently 3 letter code or a range...
	399	$month = expand_month($month);
	400	# take the first month found... might not find one!
	401	$month =~ m/_textmonth(\d\d)_/;
	402	$month = $1;
	403	}
	404	if (!defined($month)) {
	405	$month="00";
	406	}
	407	$date .= "${month}00";
	408	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
	409	}
	410
	411	# # Add the text in BibTex format (all fields)
[1676]	412	if ($text =~ /\w/) {
[3249]	413
	414	$text =~ s@&@&@g;
	415	$text =~ s@<@<@g;
	416	$text =~ s@>@>@g;
	417	$text =~ s@\n@<br/>\n@g;
	418	$text =~ s@\\@\\\\@g;
	419
[2901]	420	$doc_obj->add_utf8_text ($cursection, $text);
[1676]	421	}
	422
[1677]	423	return 1;
[1676]	424	}
	425
[1677]	426
[2901]	427
	428
	429	# convert email addresses and URLs into links
	430	sub convert_urls_into_links{
	431	my ($text) = @_;
	432
	433	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[3540]	434	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
[2901]	435
	436	return $text;
	437	}
	438
	439	# Clean up whitespace and convert \n charaters to <BR> or <P>
	440	sub clean_up_whitespaces{
	441	my ($text) = @_;
	442
	443	$text =~ s/%%%%%/<BR> <BR>/g;
	444	$text =~ s/ +/ /g;
	445	$text =~ s/\s*$//;
	446	$text =~ s/^\s*//;
	447	$text =~ s/\n/\n<BR>/g;
	448	$text =~ s/<BR>\s*<BR>/<P>/g;
	449
	450	return $text;
	451	}
	452
	453
	454	sub convert_problem_characters_without_ampersand{
	455	my ($text) = @_;
	456	$text =~ s/</</g;
	457	$text =~ s/>/>/g;
	458
	459	$text =~ s/\'\'/\"/g; #Latex -specific conversion
	460	$text =~ s/\`\`/\"/g; #Latex -specific conversion
	461
	462	$text =~ s/\"/"/g;
	463	$text =~ s/\'/’/g;
	464	$text =~ s/\`/‘/g;
	465
[3249]	466	# $text =~ s/\+/ /g;
	467	# $text =~ s/\(/ /g;
	468	# $text =~ s/\)/ /g;
	469
[2901]	470	$text =~ s/\\/\\\\/g;
	471
[3249]	472	# $text =~ s/\./\\\./g;
[2901]	473
	474	return $text;
	475	}
	476
[1676]	477	# Convert a text string into HTML.
	478
[1677]	479	# The HTML is going to be inserted into a GML file, so we have to be
[2901]	480	# careful not to use symbols like ">", which occurs frequently in email
[1677]	481	# messages (and use &gt instead.
	482
	483	# This function also turns URLs and email addresses into links, and
	484	# replaces carriage returns with <BR> tags (and multiple carriage returns
	485	# with <P> tags).
	486
[1676]	487	sub text_into_html {
	488	my ($text) = @_;
	489
[2901]	490	# Convert problem characters into HTML symbols
[1676]	491	$text =~ s/&/&/g;
	492
[2901]	493	$text = &convert_problem_characters_without_ampersand( $text );
	494
[1676]	495	# convert email addresses and URLs into links
[2901]	496	$text = &convert_urls_into_links( $text );
[1676]	497
[2901]	498	$text = &clean_up_whitespaces( $text );
[1676]	499
	500	return $text;
	501	}
	502
[2901]	503
[3142]	504	sub expand_month {
	505	my $text=shift;
[2901]	506
[3142]	507	# bibtex style files expand abbreviations for months.
	508	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
	509	$text =~ s/jan/_textmonth01_/g;
	510	$text =~ s/feb/_textmonth02_/g;
	511	$text =~ s/mar/_textmonth03_/g;
	512	$text =~ s/apr/_textmonth04_/g;
	513	$text =~ s/may/_textmonth05_/g;
	514	$text =~ s/jun/_textmonth06_/g;
	515	$text =~ s/jul/_textmonth07_/g;
	516	$text =~ s/aug/_textmonth08_/g;
	517	$text =~ s/sep/_textmonth09_/g;
	518	$text =~ s/oct/_textmonth10_/g;
	519	$text =~ s/nov/_textmonth11_/g;
	520	$text =~ s/dec/_textmonth12_/g;
[2901]	521
[3142]	522	return $text;
	523	}
	524
	525
[2901]	526	# Convert accented characters, remove { }, interprete some commands....
	527	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
	528	sub process_latex {
	529	my ($text) = @_;
	530
	531	# note - this is really ugly, but it works. There may be a prettier way
	532	# of mapping latex accented chars to utf8, but we just brute force it here.
	533	# Also, this isn't complete - not every single possible accented letter
	534	# is in here yet, but most of the common ones are.
	535
	536	my %utf8_chars =
	537	(
	538	# acutes
	539	'\'a' => chr(0xc3).chr(0xa1),
	540	'\'c' => chr(0xc4).chr(0x87),
	541	'\'e' => chr(0xc3).chr(0xa9),
	542	'\'i' => chr(0xc3).chr(0xad),
	543	'\'l' => chr(0xc3).chr(0xba),
	544	'\'n' => chr(0xc3).chr(0x84),
	545	'\'o' => chr(0xc3).chr(0xb3),
	546	'\'r' => chr(0xc5).chr(0x95),
	547	'\'s' => chr(0xc5).chr(0x9b),
	548	'\'u' => chr(0xc3).chr(0xba),
	549	'\'y' => chr(0xc3).chr(0xbd),
	550	'\'z' => chr(0xc5).chr(0xba),
	551	# graves
	552	'`a' => chr(0xc3).chr(0xa0),
	553	'`A' => chr(0xc3).chr(0x80),
	554	'`e' => chr(0xc3).chr(0xa8),
	555	'`E' => chr(0xc3).chr(0x88),
	556	'`i' => chr(0xc3).chr(0xac),
	557	'`I' => chr(0xc3).chr(0x8c),
	558	'`o' => chr(0xc3).chr(0xb2),
	559	'`O' => chr(0xc3).chr(0x92),
	560	'`u' => chr(0xc3).chr(0xb9),
	561	'`U' => chr(0xc3).chr(0x99),
	562	# circumflex
	563	'^a' => chr(0xc3).chr(0xa2),
	564	'^A' => chr(0xc3).chr(0x82),
	565	'^c' => chr(0xc4).chr(0x89),
	566	'^C' => chr(0xc4).chr(0x88),
	567	'^e' => chr(0xc3).chr(0xaa),
	568	'^E' => chr(0xc3).chr(0x8a),
	569	'^g' => chr(0xc4).chr(0x9d),
	570	'^G' => chr(0xc4).chr(0x9c),
	571	'^h' => chr(0xc4).chr(0xa5),
	572	'^H' => chr(0xc4).chr(0xa4),
	573	'^i' => chr(0xc3).chr(0xae),
	574	'^I' => chr(0xc3).chr(0x8e),
	575	'^j' => chr(0xc4).chr(0xb5),
	576	'^J' => chr(0xc4).chr(0xb4),
	577	'^o' => chr(0xc3).chr(0xb4),
	578	'^O' => chr(0xc3).chr(0x94),
	579	'^s' => chr(0xc5).chr(0x9d),
	580	'^S' => chr(0xc5).chr(0x9c),
	581	'^u' => chr(0xc3).chr(0xa2),
	582	'^U' => chr(0xc3).chr(0xbb),
	583	'^w' => chr(0xc5).chr(0xb5),
	584	'^W' => chr(0xc5).chr(0xb4),
	585	'^y' => chr(0xc5).chr(0xb7),
	586	'^Y' => chr(0xc5).chr(0xb6),
	587
	588	# diaeresis
	589	'"a' => chr(0xc3).chr(0xa4),
	590	'"A' => chr(0xc3).chr(0x84),
	591	'"e' => chr(0xc3).chr(0xab),
	592	'"E' => chr(0xc3).chr(0x8b),
	593	'"\\\\i' => chr(0xc3).chr(0xaf),
	594	'"\\\\I' => chr(0xc3).chr(0x8f),
	595	'"o' => chr(0xc3).chr(0xb6),
	596	'"O' => chr(0xc3).chr(0x96),
	597	'"u' => chr(0xc3).chr(0xbc),
	598	'"U' => chr(0xc3).chr(0x9c),
	599	'"y' => chr(0xc3).chr(0xbf),
	600	'"Y' => chr(0xc3).chr(0xb8),
	601	# tilde
[3142]	602	'~A' => chr(0xc3).chr(0x83),
	603	'~N' => chr(0xc3).chr(0x91),
	604	'~O' => chr(0xc3).chr(0x95),
	605	'~a' => chr(0xc3).chr(0xa3),
	606	'~n' => chr(0xc3).chr(0xb1),
	607	'~o' => chr(0xc3).chr(0xb5),
[2901]	608	# caron - handled specially
	609	# double acute
	610	# ring
	611	# dot
[3249]	612	'.c' => chr(0xc4).chr(0x8b),
	613	'.C' => chr(0xc4).chr(0x8a),
	614	'.e' => chr(0xc4).chr(0x97),
	615	'.E' => chr(0xc4).chr(0x96),
	616	'.g' => chr(0xc4).chr(0xa1),
	617	'.G' => chr(0xc4).chr(0xa0),
	618	'.I' => chr(0xc4).chr(0xb0),
	619	'.z' => chr(0xc5).chr(0xbc),
	620	'.Z' => chr(0xc5).chr(0xbb),
[2901]	621	# macron
	622	'=a' => chr(0xc4).chr(0x81),
	623	'=A' => chr(0xc4).chr(0x80),
	624	'=e' => chr(0xc4).chr(0x93),
	625	'=E' => chr(0xc4).chr(0x92),
	626	'=i' => chr(0xc4).chr(0xab),
	627	'=I' => chr(0xc4).chr(0xaa),
	628	'=o' => chr(0xc4).chr(0x8d),
	629	'=O' => chr(0xc4).chr(0x8c),
	630	'=u' => chr(0xc4).chr(0xab),
	631	'=U' => chr(0xc4).chr(0xaa),
	632
	633	# stroke - handled specially - see below
	634
	635	# cedilla - handled specially
	636	);
	637
	638	# these are one letter latex commands - we make sure they're not a longer
	639	# command name. eg {\d} is d+stroke, so careful of \d
	640	my %special_utf8_chars =
	641	(
[3156]	642	# breve
	643	'u g' => chr(0xc4).chr(0x9f),
	644	'u G' => chr(0xc4).chr(0x9e),
	645	'u i' => chr(0xc4).chr(0xad),
	646	'u I' => chr(0xc4).chr(0xac),
	647	'u o' => chr(0xc5).chr(0x8f),
	648	'u O' => chr(0xc5).chr(0x8e),
	649	'u u' => chr(0xc5).chr(0xad),
	650	'u U' => chr(0xc5).chr(0xac),
[2901]	651	# caron
[3156]	652	'v c' => chr(0xc4).chr(0x8d),
	653	'v C' => chr(0xc4).chr(0x8c),
[2901]	654	'v n' => chr(0xc5).chr(0x88),
	655	'v N' => chr(0xc5).chr(0x87),
	656	'v s' => chr(0xc5).chr(0xa1),
	657	'v S' => chr(0xc5).chr(0xa5),
[3156]	658	'v z' => chr(0xc5).chr(0xbe),
	659	'v Z' => chr(0xc5).chr(0xbd),
[2901]	660	# cedilla
	661	'c c' => chr(0xc3).chr(0xa7),
	662	'c C' => chr(0xc3).chr(0x87),
	663	'c g' => chr(0xc4).chr(0xa3),
	664	'c G' => chr(0xc4).chr(0xa2),
	665	'c k' => chr(0xc4).chr(0xb7),
	666	'c K' => chr(0xc4).chr(0xb6),
	667	'c l' => chr(0xc4).chr(0xbc),
	668	'c L' => chr(0xc4).chr(0xbb),
	669	'c n' => chr(0xc5).chr(0x86),
	670	'c N' => chr(0xc5).chr(0x85),
	671	'c r' => chr(0xc5).chr(0x97),
	672	'c R' => chr(0xc5).chr(0x96),
	673	'c s' => chr(0xc5).chr(0x9f),
	674	'c S' => chr(0xc5).chr(0x9e),
	675	'c t' => chr(0xc5).chr(0xa3),
	676	'c T' => chr(0xc5).chr(0xa2),
	677	# double acute / Hungarian accent
	678	'H O' => chr(0xc5).chr(0x90),
	679	'H o' => chr(0xc5).chr(0x91),
	680	'H U' => chr(0xc5).chr(0xb0),
	681	'H u' => chr(0xc5).chr(0xb1),
	682
	683	# stroke
	684	'd' => chr(0xc4).chr(0x91),
	685	'D' => chr(0xc4).chr(0x90),
	686	'h' => chr(0xc4).chr(0xa7),
	687	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
[3112]	688	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
[2901]	689	'l' => chr(0xc5).chr(0x82),
	690	'L' => chr(0xc5).chr(0x81),
	691	'o' => chr(0xc3).chr(0xb8),
	692	'O' => chr(0xc3).chr(0x98),
	693	't' => chr(0xc5).chr(0xa7),
	694	'T' => chr(0xc5).chr(0xa6),
	695	# german ss/szlig/sharp s
	696	'ss' => chr(0xc3).chr(0x9f),
	697	);
	698
	699	# convert latex-style accented characters.
[3249]	700
[2901]	701	# remove space (if any) between \ and letter to accent (eg {\' a})
	702
[13413]	703	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; #`
	704
[2901]	705	# remove {} around a single character (eg \'{e})
[32129]	706	$text =~ s@(\\[`'="^~\.])\{(\w)}@{$1$2}@g; #`
[2901]	707
[3249]	708	# \, is another way of doing cedilla \c
	709	$text =~ s@\\,(.)@\\c $1@g;
	710
[2901]	711	# remove {} around a single character for special 1 letter commands -
	712	# need to insert a space. Eg \v{s} -> {\v s}
[32129]	713	$text =~ s@(\\[uvcH])\{(\w)}@{$1 $2}@g;
[3249]	714
[2901]	715	# only do if the text contains a '\' character.
	716	if ($text =~ m\|\\\|) {
[3249]	717	# "normal" accents - ie non-alpha latex tag
[13413]	718	while ($text =~ m@\\([`'="^~\.])([\w])@) { #`
[3249]	719	my $tex="$1$2"; my $char="$2";
	720	my $replacement=$utf8_chars{$tex};
	721	if (!defined($replacement)) {
[15872]	722	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]	723	$replacement=$char;
	724	}
[5454]	725	$text =~ s/\\\Q$tex/$replacement/g;
[3249]	726	}
[2901]	727
[3249]	728	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
	729	# only do the change if immediately followed by a space, }, {, or \
	730	# one letter accents ( + ss)
	731	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
	732	my $tex=$1;
	733	my $replacement=$special_utf8_chars{$tex};
	734	if (!defined($replacement)) {
[15872]	735	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]	736	$replacement=$tex;
	737	}
	738	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
	739
	740	}
	741
	742	# one letter latex accent commands that affect following letter
	743	while ($text =~ m@\\([uvcH]) ([\w])@) {
	744	my $tex="$1 $2"; my $char="$2";
	745	my $replacement=$special_utf8_chars{$tex};
	746	if (!defined($replacement)) {
[15872]	747	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]	748	$replacement=$char;
	749	}
	750	$text =~ s/\\$tex/$replacement/g;
	751	}
[2901]	752	}
[3249]	753
	754	# escape html-sensitive characters
	755	$text =~ s@&@&@g;
	756	$text =~ s@<@<@g;
	757	$text =~ s@>@>@g;
	758	$text =~ s/''/"/g; # Latex-specific
	759	$text =~ s/``/"/g; # Latex-specific
	760	# greenstone-specific
	761	$text =~ s@\[@&\#91;@g;
	762	$text =~ s@\]@&\#93;@g;
	763
[3112]	764	# remove latex commands
[3249]	765
	766	# explicitly recognised commands
	767	$text =~ s@\\ldots@…@g;
	768
	769	# maths mode
	770	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
	771
	772	# remove all other commands with optional arguments...
[3156]	773	$text =~ s@\\\w+(\[.?\])?\s@@g;
	774	# $text =~ s@\\noopsort{[^}]+\}@@g;
	775	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
[3142]	776
[2901]	777	# remove latex groupings { } (but not \{ or \} )
[3249]	778	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
[3142]	779	$text =~ s/^\{//; # remove { if first char
	780
	781	# latex characters
	782	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
[3249]	783	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
[3142]	784	# optional break "\-"
[3249]	785	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
[3142]	786	# the non-macro bits have quotes around them - we just remove quotes
[3249]	787	$text =~ s/([^&])[\"\#]/$1/g;
[3142]	788	}
[3249]	789	# dashes. Convert (m\|n)-dash into single dash for html.
	790	$text =~ s@\-\-+@\-@g;
	791
[2901]	792	# quoted { } chars
[31780]	793	$text =~ s@\\\{@{@g;
[2901]	794	$text =~ s@\\}@}@g;
	795
[3249]	796	# finally to protect against macro language...
	797	$text =~ s@\\@\\\\@g;
	798
[2901]	799	return $text;
	800	}
	801
	802
[3249]	803	sub process_latex_math {
	804	my $text = shift;
	805
	806	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
[31780]	807	$text =~ s@\^\{(.*?)}@<sup>$1</sup>@g; # superscript
[3249]	808	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
[31780]	809	$text =~ s@\_\{(.*?)}@<sub>$1</sub>@g; # subscript
[3249]	810	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
	811
	812	# put all other command names in italics
	813	$text =~ s@\\([\w]+)@<i>$1</i>@g;
	814
	815	return $text;
	816	}
	817
[17026]	818	sub add_OID {
[2484]	819	my $self = shift (@_);
	820	my ($doc_obj, $id, $segment_number) = @_;
	821
	822	if ( $self->{'key'} eq "default") {
[17026]	823	$self->SUPER::add_OID(@_);
	824	# $doc_obj->set_OID("$id\_$segment_number");
[2484]	825	} else {
	826	$doc_obj->set_OID($self->{'key'});
	827	}
	828	}
[1676]	829
[2484]	830	1;
[4792]	831
	832

Note: See TracBrowser for help on using the repository browser.

Download in other formats: