Context Navigation

source: gsdl/trunk/perllib/plugins/BibTexPlugin.pm@ 17055

Last change on this file since 17055 was 17026, checked in by kjdon, 16 years ago
OID generation modifications: OIDtype and OIDmetadata options now available for plugins as well as import. OIDtype for plugins defaults to auto - if set to auto, then use the values from import. All plugins now call self->add_OID instead of doc_obj->set_OID. This sets the doc_obj OIDtype so that doesn't need to be donein other places any more. all plugins have the get_oid_hash_type method - normally returns hash_on_file, but can be overridden to return hash_on_ga_xml for those plugins that don't want hashing on file (MP3,OggVorbis...)
Property svn:keywords set to `Author Date Id Revision`
File size: 23.5 KB

Rev	Line
[1676]	1	###########################################################################
	2	#
[15872]	3	# BibTexPlugin.pm - a plugin for bibliography records in BibTex format
[1676]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright 2000 Gordon W. Paynter
[2901]	10	# Copyright 1999-2001 New Zealand Digital Library Project
[1676]	11	#
	12	# This program is free software; you can redistribute it and/or modify
	13	# it under the terms of the GNU General Public License as published by
	14	# the Free Software Foundation; either version 2 of the License, or
	15	# (at your option) any later version.
	16	#
	17	# This program is distributed in the hope that it will be useful,
	18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	# GNU General Public License for more details.
	21	#
	22	# You should have received a copy of the GNU General Public License
	23	# along with this program; if not, write to the Free Software
	24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	25	#
	26	###########################################################################
	27
	28
[15872]	29	# BibTexPlugin reads bibliography files in BibTex format.
[1676]	30	#
	31	# by Gordon W. Paynter ([email protected]), November 2000
	32	# Based on ReferPlug. See ReferPlug for geneology.
	33	#
[15872]	34	# BibTexPlugin creates a document object for every reference a the file.
	35	# It is a subclass of SplitTextFile, so if there are multiple records, all
[1676]	36	# are read.
[2901]	37	#
	38	# Modified Dec 2001 by John McPherson:
	39	# * some modifications submitted by Sergey Yevtushenko
	40	# <[email protected]>
	41	# * some non-ascii char support (ie mostly Latin)
[1676]	42
[3587]	43
[15872]	44	package BibTexPlugin;
[1676]	45
[15872]	46	use SplitTextFile;
[10254]	47	use strict;
	48	no strict 'refs'; # allow filehandles to be variables and viceversa
[1676]	49
[15872]	50	# BibTexPlugin is a sub-class of SplitTextFile.
[1676]	51	sub BEGIN {
[15872]	52	@BibTexPlugin::ISA = ('SplitTextFile');
[1676]	53	}
	54
[3540]	55	my $arguments =
[4744]	56	[ { 'name' => "process_exp",
[15872]	57	'desc' => "{BasePlugin.process_exp}",
[6408]	58	'type' => "regexp",
[4873]	59	'reqd' => "no",
[6408]	60	'deft' => &get_default_process_exp() },
	61	{ 'name' => "split_exp",
[15872]	62	'desc' => "{SplitTextFile.split_exp}",
[6408]	63	'type' => "regexp",
	64	'deft' => &get_default_split_exp(),
	65	'reqd' => "no" }
	66	];
[3540]	67
[15872]	68	my $options = { 'name' => "BibTexPlugin",
	69	'desc' => "{BibTexPlugin.desc}",
[6408]	70	'abstract' => "no",
	71	'inherits' => "yes",
[9120]	72	'explodes' => "yes",
[4744]	73	'args' => $arguments };
[3540]	74
[1676]	75	# This plugin processes files with the suffix ".bib"
	76	sub get_default_process_exp {
	77	return q^(?i)\.bib$^;
	78	}
	79
	80	# This plugin splits the input text at blank lines
	81	sub get_default_split_exp {
	82	return q^\n+(?=@)^;
	83	}
[15872]	84
[3540]	85	sub new {
[10218]	86	my ($class) = shift (@_);
	87	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	88	push(@$pluginlist, $class);
	89
[15872]	90	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	91	push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]	92
[15872]	93	my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
[10218]	94
[3540]	95	return bless $self, $class;
	96	}
[1676]	97
[2901]	98
	99
[1676]	100	# The process function reads a single bibliographic record and stores
	101	# it as a new document.
	102
	103	sub process {
	104	my $self = shift (@_);
[6332]	105	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1676]	106	my $outhandle = $self->{'outhandle'};
	107
[10254]	108	my $cursection = $doc_obj->get_top_section();
[2484]	109	$self->{'key'} = "default";
	110
[1676]	111	# Check that we're dealing with a valid BibTex record
	112	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
	113
	114	# Ignore things we can't use
	115	return 0 if ($$textref =~ /^\@String/);
	116
[1711]	117	# This hash translates BibTex field names into metadata names. The
[1676]	118	# BibTex names are taken from the "Local Guide to Latex" Graeme
[2901]	119	# McKinstry. Metadata names are consistent with ReferPlug.
[1676]	120
[9582]	121	# The author metadata will be stored as one "Creator" entry, but will
	122	# also be split up into several individual "Author" fields.
	123
[1676]	124	my %field = (
	125	'address', 'PublisherAddress',
	126	'author', 'Creator',
[2901]	127
[1676]	128	'booktitle', 'Booktitle',
	129	'chapter', 'Chapter',
	130	'edition', 'Edition',
[2901]	131	'editor', 'Editor',
[1676]	132	'institution', 'Publisher',
	133	'journal', 'Journal',
	134	'month', 'Month',
	135	'number', 'Number',
	136	'pages', 'Pages',
	137	'publisher', 'Publisher',
	138	'school', 'Publisher',
	139	'title', 'Title',
	140	'volume', 'Volume',
[3142]	141	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
[1676]	142
	143	'keywords', 'Keywords',
	144	'abstract', 'Abstract',
[4792]	145	'copyright', 'Copyright',
	146	'note', 'Note',
[11527]	147	'url', 'URL',
[8121]	148	);
[1676]	149
	150	# Metadata fields
	151	my %metadata;
[4792]	152	my ($EntryType, $Creator, $Keywords, $text);
[1676]	153
[2901]	154	my $verbosity = $self->{'verbosity'};
	155	$verbosity = 0 unless $verbosity;
	156
[3142]	157	# Make sure the text has exactly one entry per line -
	158	# append line to previous if it doesn't start with " <key> = "
[2901]	159
[4792]	160	my @input_lines=split(/\r?\n/, $$textref);
[3142]	161	my @all_lines;
	162	my $entry_line=shift @input_lines;
	163	foreach my $input_line (@input_lines) {
	164	if ($input_line =~ m/^\s\w+\s=\s*/) {
	165	# this is a new key
	166	push(@all_lines, $entry_line);
	167	$entry_line=$input_line;
	168	} else {
	169	# this is a continuation of previous line
[3249]	170	$entry_line .= " " . $input_line;
[3142]	171	}
	172
	173	}
	174	# add final line, removing trailing '}'
	175	$entry_line =~ s/\}\s*$//;
	176	push(@all_lines, $entry_line);
	177	push(@all_lines, "}");
[2901]	178
[1676]	179	# Read and process each line in the bib file.
[2901]	180	my ($entryname, $name, $value, $line);
	181	foreach $line (@all_lines) {
	182
[1676]	183	# Add each line. Most lines consist of a field identifer and
	184	# then data, and we simply store them, though we treat some
	185	# of the fields a bit differently.
	186
	187	$line =~ s/\s+/ /g;
	188	$text .= "$line\n";
	189
[4792]	190
[2901]	191	print "Processing line = $line \n" if $verbosity>=4;
	192
[1676]	193	# The first line is special, it contains the reference type and OID
[4792]	194	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-])\W*$/) {
[1676]	195	$EntryType = $1;
[4792]	196	my $EntryID = (defined $2) ? $2 : "default";
[2484]	197	print "** $EntryType - \"$EntryID\" \n"
	198	if ($verbosity >= 4);
[4792]	199
[1676]	200	next;
	201	}
[2484]	202	if ($line =~ /\@/) {
[2901]	203	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
[2484]	204	if ($verbosity >= 2);
[2901]	205	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
[2484]	206	if ($verbosity >= 2);
	207	}
[1676]	208
	209	# otherwise, parse the metadata out of this line
[9582]	210	next unless ($line =~ /^\s(\w+)\s=\s(.)/);
[2901]	211	$entryname = lc($1);
[1676]	212	$value = $2;
[3112]	213	$value =~ s/,?\s*$//; # remove trailing comma and space
	214	if ($value =~ /^"/ && $value =~ /"$/) {
	215	# remove surrounding " marks
	216	$value =~ s/^"//; $value =~ s/"$//;
[13416]	217	} elsif ($value =~ /^\{/ && $value =~ /\}$/) {
	218	# remove surrounding {} marks
	219	$value =~ s/^\{//; $value =~ s/\}$//;
[3112]	220	}
[13413]	221	# special case for year - we only want a 4 digit year, not
	222	# "circa 2004" etc
	223	if ($entryname=~ /^year$/i) {
	224	if ($value=~ /(\d{4})/) {
	225	$value=$1;
	226	$metadata{$entryname} .= "$value";
	227	}
	228	}
	229	else {
	230
	231	$value = &process_latex($value);
	232	# Add this line of metadata
	233	$metadata{$entryname} .= "$value";
	234	}
[1676]	235	}
	236
[1677]	237	# Add the Entry type as metadata
[2901]	238	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
[8121]	239
	240	#Add the fileformat as metadata
	241	$doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
[1676]	242
[8121]	243
[1676]	244	# Add the various field as metadata
[2901]	245	foreach my $entryname (keys %metadata) {
	246	next unless (defined $field{$entryname});
	247	next unless (defined $metadata{$entryname});
[1676]	248
[2901]	249	$name = $field{$entryname};
	250	$value = $metadata{$entryname};
[1676]	251
[3142]	252	if ($name =~ /^Month/) {
	253	$value=expand_month($value);
	254	}
[1676]	255
	256	# Several special operatons on metadata follow
	257
	258	# Add individual keywords.
	259	# The full set of keywords will be added, in due course, as "Keywords".
	260	# However, we also want to add them as individual "Keyword" metadata elements.
[2901]	261	if ($entryname eq "keywords") {
[1676]	262	my @keywordlist = split(/,/, $value);
	263	foreach my $k (@keywordlist) {
[2901]	264	$k = lc($k);
[1676]	265	$k =~ s/\s*$//;
	266	$k =~ s/^\s*//;
	267	if ($k =~ /\w/) {
[2901]	268	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
[1676]	269	}
	270	}
	271	}
	272
	273	# Add individual authors
	274	# The author metadata will be stored as one "Creator" entry, but we
	275	# also want to split it into several individual "Author" fields in
[3112]	276	# "Lastname, Firstnames" format so we can browse it.
[2901]	277	if ($entryname eq "author") { #added also comparison with editor
	278
[3249]	279	# take care of "et al."...
[9582]	280	my $etal='';
	281	if ($value =~ s/\s+(and\s+others\|et\.?\s+al\.?)\s*$//i) {
	282	$etal=' <em>et. al.</em>';
	283	}
[2901]	284	# und here for german language...
	285	# don't use brackets in pattern, else the matched bit becomes
	286	# an element in the list!
[3249]	287	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
	288	my @formattedlist = ();
[1676]	289	foreach $a (@authorlist) {
	290	$a =~ s/\s*$//;
	291	$a =~ s/^\s*//;
	292	# Reformat and add author name
[2901]	293	next if $a=~ /^\s*$/;
[3249]	294
	295	# names are "First von Last", "von Last, First"
	296	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
	297	my $first="";
	298	my $vonlast="";
	299	my $jr="";
[2901]	300
[3249]	301	if ($a =~ /,/) {
	302	my @parts=split(/,\s*/, $a);
	303	$first = pop @parts;
	304	if (scalar(@parts) == 2) {
	305	$jr = pop @parts;
	306	}
	307	$vonlast=shift @parts;
	308	if (scalar(@parts) > 0) {
[15872]	309	print $outhandle "BibTexPlugin: couldn't parse name $a\n";
[3249]	310	# but we continue anyway...
	311	}
	312	} else { # First von Last
	313	my @words = split(/ /, $a);
	314	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
	315	$first .= " " . shift (@words);
	316	}
	317	$first =~ s/^\s//;
	318	$vonlast = join (' ', @words); # whatever's left...
	319	}
	320	my $von="";
	321	my $last="";
	322	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
	323	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
	324	$von = $1;
	325	if (!defined ($von)) {
	326	# some non-English names do start with lowercase
	327	# eg "Marie desJardins". Also we can get typos...
[15872]	328	print $outhandle "BibTexPlugin: couldn't parse surname $vonlast\n";
[3249]	329	$von="";
	330	if ($vonlast =~ /^[a-z]+$/) {
	331	# if it's all lowercase, uppercase 1st.
	332	$vonlast =~ s/^(.)/\u$1/;
[2901]	333
[3249]	334	}
	335	}
	336	$von =~ s/\s*$//;
	337	$last=$vonlast;
	338	} else {
	339	$last=$vonlast;
	340	}
	341	my $wholename="$first $von $last $jr";
[9582]	342	$wholename =~ s/\s+/ /g; # squeeze multiple spaces
	343	$wholename =~ s/ $//;
[3249]	344	push (@formattedlist, $wholename);
	345	my $fullname = "$last";
	346	$fullname .= " $jr" if ($jr);
	347	$fullname .= ", $first";
	348	$fullname .= " $von" if ($von);
	349
[1676]	350	# Add each name to set of Authors
[3249]	351	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
[1676]	352	}
[3249]	353
	354	# Only want at most one "and" in the Creator field
	355	if (scalar(@formattedlist) > 2) {
	356	my $lastauthor=pop @formattedlist;
	357	$value=join(', ', @formattedlist);
	358	$value.=" and $lastauthor";
	359	} else { # 1 or 2 authors...
	360	$value=join(" and ",@formattedlist);
	361	}
	362	$value.=$etal; # if there was "et al."
[1676]	363	}
	364
	365	# Books and Journals are additionally marked for display purposes
[2901]	366	if ($entryname eq "booktitle") {
	367	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
	368	} elsif ($entryname eq "journal") {
	369	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
[1676]	370	}
	371
[3249]	372	# Add the various fields as metadata
	373	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
[13413]	374
[1676]	375	}
	376
[9582]	377	# for books and journals...
	378	if (!exists $metadata{'title'}) {
	379	my $name=$field{'title'}; # get Greenstone metadata name
	380	my $value;
	381	if (exists $metadata{'booktitle'}) {
	382	$value=$metadata{'booktitle'};
	383	} elsif (exists $metadata{'journal'}) {
	384	$value=$metadata{'journal'};
	385	}
	386	if ($value) {
	387	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
	388	}
	389	}
	390
[3249]	391	# Add Date (yyyymmdd) metadata
	392	if (defined ($metadata{'year'}) ) {
	393	my $date=$metadata{'year'};
	394	chomp $date;
	395	my $month=$metadata{'month'};
	396	if (defined($month)) {
	397	# month is currently 3 letter code or a range...
	398	$month = expand_month($month);
	399	# take the first month found... might not find one!
	400	$month =~ m/_textmonth(\d\d)_/;
	401	$month = $1;
	402	}
	403	if (!defined($month)) {
	404	$month="00";
	405	}
	406	$date .= "${month}00";
	407	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
	408	}
	409
	410	# # Add the text in BibTex format (all fields)
[1676]	411	if ($text =~ /\w/) {
[3249]	412
	413	$text =~ s@&@&@g;
	414	$text =~ s@<@<@g;
	415	$text =~ s@>@>@g;
	416	$text =~ s@\n@<br/>\n@g;
	417	$text =~ s@\\@\\\\@g;
	418
[2901]	419	$doc_obj->add_utf8_text ($cursection, $text);
[1676]	420	}
	421
[1677]	422	return 1;
[1676]	423	}
	424
[1677]	425
[2901]	426
	427
	428	# convert email addresses and URLs into links
	429	sub convert_urls_into_links{
	430	my ($text) = @_;
	431
	432	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[3540]	433	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
[2901]	434
	435	return $text;
	436	}
	437
	438	# Clean up whitespace and convert \n charaters to <BR> or <P>
	439	sub clean_up_whitespaces{
	440	my ($text) = @_;
	441
	442	$text =~ s/%%%%%/<BR> <BR>/g;
	443	$text =~ s/ +/ /g;
	444	$text =~ s/\s*$//;
	445	$text =~ s/^\s*//;
	446	$text =~ s/\n/\n<BR>/g;
	447	$text =~ s/<BR>\s*<BR>/<P>/g;
	448
	449	return $text;
	450	}
	451
	452
	453	sub convert_problem_characters_without_ampersand{
	454	my ($text) = @_;
	455	$text =~ s/</</g;
	456	$text =~ s/>/>/g;
	457
	458	$text =~ s/\'\'/\"/g; #Latex -specific conversion
	459	$text =~ s/\`\`/\"/g; #Latex -specific conversion
	460
	461	$text =~ s/\"/"/g;
	462	$text =~ s/\'/’/g;
	463	$text =~ s/\`/‘/g;
	464
[3249]	465	# $text =~ s/\+/ /g;
	466	# $text =~ s/\(/ /g;
	467	# $text =~ s/\)/ /g;
	468
[2901]	469	$text =~ s/\\/\\\\/g;
	470
[3249]	471	# $text =~ s/\./\\\./g;
[2901]	472
	473	return $text;
	474	}
	475
[1676]	476	# Convert a text string into HTML.
	477
[1677]	478	# The HTML is going to be inserted into a GML file, so we have to be
[2901]	479	# careful not to use symbols like ">", which occurs frequently in email
[1677]	480	# messages (and use &gt instead.
	481
	482	# This function also turns URLs and email addresses into links, and
	483	# replaces carriage returns with <BR> tags (and multiple carriage returns
	484	# with <P> tags).
	485
[1676]	486	sub text_into_html {
	487	my ($text) = @_;
	488
[2901]	489	# Convert problem characters into HTML symbols
[1676]	490	$text =~ s/&/&/g;
	491
[2901]	492	$text = &convert_problem_characters_without_ampersand( $text );
	493
[1676]	494	# convert email addresses and URLs into links
[2901]	495	$text = &convert_urls_into_links( $text );
[1676]	496
[2901]	497	$text = &clean_up_whitespaces( $text );
[1676]	498
	499	return $text;
	500	}
	501
[2901]	502
[3142]	503	sub expand_month {
	504	my $text=shift;
[2901]	505
[3142]	506	# bibtex style files expand abbreviations for months.
	507	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
	508	$text =~ s/jan/_textmonth01_/g;
	509	$text =~ s/feb/_textmonth02_/g;
	510	$text =~ s/mar/_textmonth03_/g;
	511	$text =~ s/apr/_textmonth04_/g;
	512	$text =~ s/may/_textmonth05_/g;
	513	$text =~ s/jun/_textmonth06_/g;
	514	$text =~ s/jul/_textmonth07_/g;
	515	$text =~ s/aug/_textmonth08_/g;
	516	$text =~ s/sep/_textmonth09_/g;
	517	$text =~ s/oct/_textmonth10_/g;
	518	$text =~ s/nov/_textmonth11_/g;
	519	$text =~ s/dec/_textmonth12_/g;
[2901]	520
[3142]	521	return $text;
	522	}
	523
	524
[2901]	525	# Convert accented characters, remove { }, interprete some commands....
	526	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
	527	sub process_latex {
	528	my ($text) = @_;
	529
	530	# note - this is really ugly, but it works. There may be a prettier way
	531	# of mapping latex accented chars to utf8, but we just brute force it here.
	532	# Also, this isn't complete - not every single possible accented letter
	533	# is in here yet, but most of the common ones are.
	534
	535	my %utf8_chars =
	536	(
	537	# acutes
	538	'\'a' => chr(0xc3).chr(0xa1),
	539	'\'c' => chr(0xc4).chr(0x87),
	540	'\'e' => chr(0xc3).chr(0xa9),
	541	'\'i' => chr(0xc3).chr(0xad),
	542	'\'l' => chr(0xc3).chr(0xba),
	543	'\'n' => chr(0xc3).chr(0x84),
	544	'\'o' => chr(0xc3).chr(0xb3),
	545	'\'r' => chr(0xc5).chr(0x95),
	546	'\'s' => chr(0xc5).chr(0x9b),
	547	'\'u' => chr(0xc3).chr(0xba),
	548	'\'y' => chr(0xc3).chr(0xbd),
	549	'\'z' => chr(0xc5).chr(0xba),
	550	# graves
	551	'`a' => chr(0xc3).chr(0xa0),
	552	'`A' => chr(0xc3).chr(0x80),
	553	'`e' => chr(0xc3).chr(0xa8),
	554	'`E' => chr(0xc3).chr(0x88),
	555	'`i' => chr(0xc3).chr(0xac),
	556	'`I' => chr(0xc3).chr(0x8c),
	557	'`o' => chr(0xc3).chr(0xb2),
	558	'`O' => chr(0xc3).chr(0x92),
	559	'`u' => chr(0xc3).chr(0xb9),
	560	'`U' => chr(0xc3).chr(0x99),
	561	# circumflex
	562	'^a' => chr(0xc3).chr(0xa2),
	563	'^A' => chr(0xc3).chr(0x82),
	564	'^c' => chr(0xc4).chr(0x89),
	565	'^C' => chr(0xc4).chr(0x88),
	566	'^e' => chr(0xc3).chr(0xaa),
	567	'^E' => chr(0xc3).chr(0x8a),
	568	'^g' => chr(0xc4).chr(0x9d),
	569	'^G' => chr(0xc4).chr(0x9c),
	570	'^h' => chr(0xc4).chr(0xa5),
	571	'^H' => chr(0xc4).chr(0xa4),
	572	'^i' => chr(0xc3).chr(0xae),
	573	'^I' => chr(0xc3).chr(0x8e),
	574	'^j' => chr(0xc4).chr(0xb5),
	575	'^J' => chr(0xc4).chr(0xb4),
	576	'^o' => chr(0xc3).chr(0xb4),
	577	'^O' => chr(0xc3).chr(0x94),
	578	'^s' => chr(0xc5).chr(0x9d),
	579	'^S' => chr(0xc5).chr(0x9c),
	580	'^u' => chr(0xc3).chr(0xa2),
	581	'^U' => chr(0xc3).chr(0xbb),
	582	'^w' => chr(0xc5).chr(0xb5),
	583	'^W' => chr(0xc5).chr(0xb4),
	584	'^y' => chr(0xc5).chr(0xb7),
	585	'^Y' => chr(0xc5).chr(0xb6),
	586
	587	# diaeresis
	588	'"a' => chr(0xc3).chr(0xa4),
	589	'"A' => chr(0xc3).chr(0x84),
	590	'"e' => chr(0xc3).chr(0xab),
	591	'"E' => chr(0xc3).chr(0x8b),
	592	'"\\\\i' => chr(0xc3).chr(0xaf),
	593	'"\\\\I' => chr(0xc3).chr(0x8f),
	594	'"o' => chr(0xc3).chr(0xb6),
	595	'"O' => chr(0xc3).chr(0x96),
	596	'"u' => chr(0xc3).chr(0xbc),
	597	'"U' => chr(0xc3).chr(0x9c),
	598	'"y' => chr(0xc3).chr(0xbf),
	599	'"Y' => chr(0xc3).chr(0xb8),
	600	# tilde
[3142]	601	'~A' => chr(0xc3).chr(0x83),
	602	'~N' => chr(0xc3).chr(0x91),
	603	'~O' => chr(0xc3).chr(0x95),
	604	'~a' => chr(0xc3).chr(0xa3),
	605	'~n' => chr(0xc3).chr(0xb1),
	606	'~o' => chr(0xc3).chr(0xb5),
[2901]	607	# caron - handled specially
	608	# double acute
	609	# ring
	610	# dot
[3249]	611	'.c' => chr(0xc4).chr(0x8b),
	612	'.C' => chr(0xc4).chr(0x8a),
	613	'.e' => chr(0xc4).chr(0x97),
	614	'.E' => chr(0xc4).chr(0x96),
	615	'.g' => chr(0xc4).chr(0xa1),
	616	'.G' => chr(0xc4).chr(0xa0),
	617	'.I' => chr(0xc4).chr(0xb0),
	618	'.z' => chr(0xc5).chr(0xbc),
	619	'.Z' => chr(0xc5).chr(0xbb),
[2901]	620	# macron
	621	'=a' => chr(0xc4).chr(0x81),
	622	'=A' => chr(0xc4).chr(0x80),
	623	'=e' => chr(0xc4).chr(0x93),
	624	'=E' => chr(0xc4).chr(0x92),
	625	'=i' => chr(0xc4).chr(0xab),
	626	'=I' => chr(0xc4).chr(0xaa),
	627	'=o' => chr(0xc4).chr(0x8d),
	628	'=O' => chr(0xc4).chr(0x8c),
	629	'=u' => chr(0xc4).chr(0xab),
	630	'=U' => chr(0xc4).chr(0xaa),
	631
	632	# stroke - handled specially - see below
	633
	634	# cedilla - handled specially
	635	);
	636
	637	# these are one letter latex commands - we make sure they're not a longer
	638	# command name. eg {\d} is d+stroke, so careful of \d
	639	my %special_utf8_chars =
	640	(
[3156]	641	# breve
	642	'u g' => chr(0xc4).chr(0x9f),
	643	'u G' => chr(0xc4).chr(0x9e),
	644	'u i' => chr(0xc4).chr(0xad),
	645	'u I' => chr(0xc4).chr(0xac),
	646	'u o' => chr(0xc5).chr(0x8f),
	647	'u O' => chr(0xc5).chr(0x8e),
	648	'u u' => chr(0xc5).chr(0xad),
	649	'u U' => chr(0xc5).chr(0xac),
[2901]	650	# caron
[3156]	651	'v c' => chr(0xc4).chr(0x8d),
	652	'v C' => chr(0xc4).chr(0x8c),
[2901]	653	'v n' => chr(0xc5).chr(0x88),
	654	'v N' => chr(0xc5).chr(0x87),
	655	'v s' => chr(0xc5).chr(0xa1),
	656	'v S' => chr(0xc5).chr(0xa5),
[3156]	657	'v z' => chr(0xc5).chr(0xbe),
	658	'v Z' => chr(0xc5).chr(0xbd),
[2901]	659	# cedilla
	660	'c c' => chr(0xc3).chr(0xa7),
	661	'c C' => chr(0xc3).chr(0x87),
	662	'c g' => chr(0xc4).chr(0xa3),
	663	'c G' => chr(0xc4).chr(0xa2),
	664	'c k' => chr(0xc4).chr(0xb7),
	665	'c K' => chr(0xc4).chr(0xb6),
	666	'c l' => chr(0xc4).chr(0xbc),
	667	'c L' => chr(0xc4).chr(0xbb),
	668	'c n' => chr(0xc5).chr(0x86),
	669	'c N' => chr(0xc5).chr(0x85),
	670	'c r' => chr(0xc5).chr(0x97),
	671	'c R' => chr(0xc5).chr(0x96),
	672	'c s' => chr(0xc5).chr(0x9f),
	673	'c S' => chr(0xc5).chr(0x9e),
	674	'c t' => chr(0xc5).chr(0xa3),
	675	'c T' => chr(0xc5).chr(0xa2),
	676	# double acute / Hungarian accent
	677	'H O' => chr(0xc5).chr(0x90),
	678	'H o' => chr(0xc5).chr(0x91),
	679	'H U' => chr(0xc5).chr(0xb0),
	680	'H u' => chr(0xc5).chr(0xb1),
	681
	682	# stroke
	683	'd' => chr(0xc4).chr(0x91),
	684	'D' => chr(0xc4).chr(0x90),
	685	'h' => chr(0xc4).chr(0xa7),
	686	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
[3112]	687	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
[2901]	688	'l' => chr(0xc5).chr(0x82),
	689	'L' => chr(0xc5).chr(0x81),
	690	'o' => chr(0xc3).chr(0xb8),
	691	'O' => chr(0xc3).chr(0x98),
	692	't' => chr(0xc5).chr(0xa7),
	693	'T' => chr(0xc5).chr(0xa6),
	694	# german ss/szlig/sharp s
	695	'ss' => chr(0xc3).chr(0x9f),
	696	);
	697
	698	# convert latex-style accented characters.
[3249]	699
[2901]	700	# remove space (if any) between \ and letter to accent (eg {\' a})
	701
[13413]	702	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; #`
	703
[2901]	704	# remove {} around a single character (eg \'{e})
[13413]	705	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g; #`
[2901]	706
[3249]	707	# \, is another way of doing cedilla \c
	708	$text =~ s@\\,(.)@\\c $1@g;
	709
[2901]	710	# remove {} around a single character for special 1 letter commands -
	711	# need to insert a space. Eg \v{s} -> {\v s}
[3156]	712	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
[3249]	713
[2901]	714	# only do if the text contains a '\' character.
	715	if ($text =~ m\|\\\|) {
[3249]	716	# "normal" accents - ie non-alpha latex tag
[13413]	717	while ($text =~ m@\\([`'="^~\.])([\w])@) { #`
[3249]	718	my $tex="$1$2"; my $char="$2";
	719	my $replacement=$utf8_chars{$tex};
	720	if (!defined($replacement)) {
[15872]	721	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]	722	$replacement=$char;
	723	}
[5454]	724	$text =~ s/\\\Q$tex/$replacement/g;
[3249]	725	}
[2901]	726
[3249]	727	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
	728	# only do the change if immediately followed by a space, }, {, or \
	729	# one letter accents ( + ss)
	730	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
	731	my $tex=$1;
	732	my $replacement=$special_utf8_chars{$tex};
	733	if (!defined($replacement)) {
[15872]	734	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]	735	$replacement=$tex;
	736	}
	737	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
	738
	739	}
	740
	741	# one letter latex accent commands that affect following letter
	742	while ($text =~ m@\\([uvcH]) ([\w])@) {
	743	my $tex="$1 $2"; my $char="$2";
	744	my $replacement=$special_utf8_chars{$tex};
	745	if (!defined($replacement)) {
[15872]	746	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]	747	$replacement=$char;
	748	}
	749	$text =~ s/\\$tex/$replacement/g;
	750	}
[2901]	751	}
[3249]	752
	753	# escape html-sensitive characters
	754	$text =~ s@&@&@g;
	755	$text =~ s@<@<@g;
	756	$text =~ s@>@>@g;
	757	$text =~ s/''/"/g; # Latex-specific
	758	$text =~ s/``/"/g; # Latex-specific
	759	# greenstone-specific
	760	$text =~ s@\[@&\#91;@g;
	761	$text =~ s@\]@&\#93;@g;
	762
[3112]	763	# remove latex commands
[3249]	764
	765	# explicitly recognised commands
	766	$text =~ s@\\ldots@…@g;
	767
	768	# maths mode
	769	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
	770
	771	# remove all other commands with optional arguments...
[3156]	772	$text =~ s@\\\w+(\[.?\])?\s@@g;
	773	# $text =~ s@\\noopsort{[^}]+\}@@g;
	774	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
[3142]	775
[2901]	776	# remove latex groupings { } (but not \{ or \} )
[3249]	777	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
[3142]	778	$text =~ s/^\{//; # remove { if first char
	779
	780	# latex characters
	781	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
[3249]	782	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
[3142]	783	# optional break "\-"
[3249]	784	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
[3142]	785	# the non-macro bits have quotes around them - we just remove quotes
[3249]	786	$text =~ s/([^&])[\"\#]/$1/g;
[3142]	787	}
[3249]	788	# dashes. Convert (m\|n)-dash into single dash for html.
	789	$text =~ s@\-\-+@\-@g;
	790
[2901]	791	# quoted { } chars
	792	$text =~ s@\\{@{@g;
	793	$text =~ s@\\}@}@g;
	794
[3249]	795	# finally to protect against macro language...
	796	$text =~ s@\\@\\\\@g;
	797
[2901]	798	return $text;
	799	}
	800
	801
[3249]	802	sub process_latex_math {
	803	my $text = shift;
	804
	805	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
	806	$text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
	807	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
	808	$text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
	809	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
	810
	811	# put all other command names in italics
	812	$text =~ s@\\([\w]+)@<i>$1</i>@g;
	813
	814	return $text;
	815	}
	816
[17026]	817	sub add_OID {
[2484]	818	my $self = shift (@_);
	819	my ($doc_obj, $id, $segment_number) = @_;
	820
	821	if ( $self->{'key'} eq "default") {
[17026]	822	$self->SUPER::add_OID(@_);
	823	# $doc_obj->set_OID("$id\_$segment_number");
[2484]	824	} else {
	825	$doc_obj->set_OID($self->{'key'});
	826	}
	827	}
[1676]	828
[2484]	829	1;
[4792]	830
	831

Note: See TracBrowser for help on using the repository browser.

Download in other formats: