Context Navigation

source: main/trunk/greenstone2/perllib/plugins/BibTexPlugin.pm@ 29745

Last change on this file since 29745 was 24548, checked in by ak19, 13 years ago
Part 2 of previous commit (r24547). Added new abstract plugin MetadataRead? that defines can_process_this_file_for_metadata that MetadataPlugin? subclasses can inherit (if MetadataRead? is listed first in the ISA inheritance list) and which will then override the one defined in BasePlugin?. For now committing MARC, ISIS and OAIPlugins which now additionally inherit from MetadataRead?. Other metadataPlugins also need to be committed.
Property svn:keywords set to `Author Date Id Revision`
File size: 23.5 KB

Rev	Line
[1676]	1	###########################################################################
	2	#
[15872]	3	# BibTexPlugin.pm - a plugin for bibliography records in BibTex format
[1676]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright 2000 Gordon W. Paynter
[2901]	10	# Copyright 1999-2001 New Zealand Digital Library Project
[1676]	11	#
	12	# This program is free software; you can redistribute it and/or modify
	13	# it under the terms of the GNU General Public License as published by
	14	# the Free Software Foundation; either version 2 of the License, or
	15	# (at your option) any later version.
	16	#
	17	# This program is distributed in the hope that it will be useful,
	18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	# GNU General Public License for more details.
	21	#
	22	# You should have received a copy of the GNU General Public License
	23	# along with this program; if not, write to the Free Software
	24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	25	#
	26	###########################################################################
	27
	28
[15872]	29	# BibTexPlugin reads bibliography files in BibTex format.
[1676]	30	#
	31	# by Gordon W. Paynter ([email protected]), November 2000
	32	# Based on ReferPlug. See ReferPlug for geneology.
	33	#
[15872]	34	# BibTexPlugin creates a document object for every reference a the file.
	35	# It is a subclass of SplitTextFile, so if there are multiple records, all
[1676]	36	# are read.
[2901]	37	#
	38	# Modified Dec 2001 by John McPherson:
	39	# * some modifications submitted by Sergey Yevtushenko
	40	# <[email protected]>
	41	# * some non-ascii char support (ie mostly Latin)
[1676]	42
[3587]	43
[15872]	44	package BibTexPlugin;
[1676]	45
[15872]	46	use SplitTextFile;
[24548]	47	use MetadataRead;
[10254]	48	use strict;
	49	no strict 'refs'; # allow filehandles to be variables and viceversa
[1676]	50
[15872]	51	# BibTexPlugin is a sub-class of SplitTextFile.
[1676]	52	sub BEGIN {
[24548]	53	@BibTexPlugin::ISA = ('MetadataRead', 'SplitTextFile');
[1676]	54	}
	55
[3540]	56	my $arguments =
[4744]	57	[ { 'name' => "process_exp",
[15872]	58	'desc' => "{BasePlugin.process_exp}",
[6408]	59	'type' => "regexp",
[4873]	60	'reqd' => "no",
[6408]	61	'deft' => &get_default_process_exp() },
	62	{ 'name' => "split_exp",
[15872]	63	'desc' => "{SplitTextFile.split_exp}",
[6408]	64	'type' => "regexp",
	65	'deft' => &get_default_split_exp(),
	66	'reqd' => "no" }
	67	];
[3540]	68
[15872]	69	my $options = { 'name' => "BibTexPlugin",
	70	'desc' => "{BibTexPlugin.desc}",
[6408]	71	'abstract' => "no",
	72	'inherits' => "yes",
[9120]	73	'explodes' => "yes",
[4744]	74	'args' => $arguments };
[3540]	75
[1676]	76	# This plugin processes files with the suffix ".bib"
	77	sub get_default_process_exp {
	78	return q^(?i)\.bib$^;
	79	}
	80
	81	# This plugin splits the input text at blank lines
	82	sub get_default_split_exp {
	83	return q^\n+(?=@)^;
	84	}
[15872]	85
[3540]	86	sub new {
[10218]	87	my ($class) = shift (@_);
	88	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	89	push(@$pluginlist, $class);
	90
[15872]	91	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	92	push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]	93
[15872]	94	my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
[10218]	95
[3540]	96	return bless $self, $class;
	97	}
[1676]	98
[2901]	99
	100
[1676]	101	# The process function reads a single bibliographic record and stores
	102	# it as a new document.
	103
	104	sub process {
	105	my $self = shift (@_);
[6332]	106	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1676]	107	my $outhandle = $self->{'outhandle'};
	108
[10254]	109	my $cursection = $doc_obj->get_top_section();
[2484]	110	$self->{'key'} = "default";
	111
[1676]	112	# Check that we're dealing with a valid BibTex record
	113	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
	114
	115	# Ignore things we can't use
	116	return 0 if ($$textref =~ /^\@String/);
	117
[1711]	118	# This hash translates BibTex field names into metadata names. The
[1676]	119	# BibTex names are taken from the "Local Guide to Latex" Graeme
[2901]	120	# McKinstry. Metadata names are consistent with ReferPlug.
[1676]	121
[9582]	122	# The author metadata will be stored as one "Creator" entry, but will
	123	# also be split up into several individual "Author" fields.
	124
[1676]	125	my %field = (
	126	'address', 'PublisherAddress',
	127	'author', 'Creator',
[2901]	128
[1676]	129	'booktitle', 'Booktitle',
	130	'chapter', 'Chapter',
	131	'edition', 'Edition',
[2901]	132	'editor', 'Editor',
[1676]	133	'institution', 'Publisher',
	134	'journal', 'Journal',
	135	'month', 'Month',
	136	'number', 'Number',
	137	'pages', 'Pages',
	138	'publisher', 'Publisher',
	139	'school', 'Publisher',
	140	'title', 'Title',
	141	'volume', 'Volume',
[3142]	142	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
[1676]	143
	144	'keywords', 'Keywords',
	145	'abstract', 'Abstract',
[4792]	146	'copyright', 'Copyright',
	147	'note', 'Note',
[11527]	148	'url', 'URL',
[8121]	149	);
[1676]	150
	151	# Metadata fields
	152	my %metadata;
[4792]	153	my ($EntryType, $Creator, $Keywords, $text);
[1676]	154
[2901]	155	my $verbosity = $self->{'verbosity'};
	156	$verbosity = 0 unless $verbosity;
	157
[3142]	158	# Make sure the text has exactly one entry per line -
	159	# append line to previous if it doesn't start with " <key> = "
[2901]	160
[4792]	161	my @input_lines=split(/\r?\n/, $$textref);
[3142]	162	my @all_lines;
	163	my $entry_line=shift @input_lines;
	164	foreach my $input_line (@input_lines) {
	165	if ($input_line =~ m/^\s\w+\s=\s*/) {
	166	# this is a new key
	167	push(@all_lines, $entry_line);
	168	$entry_line=$input_line;
	169	} else {
	170	# this is a continuation of previous line
[3249]	171	$entry_line .= " " . $input_line;
[3142]	172	}
	173
	174	}
	175	# add final line, removing trailing '}'
	176	$entry_line =~ s/\}\s*$//;
	177	push(@all_lines, $entry_line);
	178	push(@all_lines, "}");
[2901]	179
[1676]	180	# Read and process each line in the bib file.
[2901]	181	my ($entryname, $name, $value, $line);
	182	foreach $line (@all_lines) {
	183
[1676]	184	# Add each line. Most lines consist of a field identifer and
	185	# then data, and we simply store them, though we treat some
	186	# of the fields a bit differently.
	187
	188	$line =~ s/\s+/ /g;
	189	$text .= "$line\n";
	190
[4792]	191
[2901]	192	print "Processing line = $line \n" if $verbosity>=4;
	193
[1676]	194	# The first line is special, it contains the reference type and OID
[4792]	195	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-])\W*$/) {
[1676]	196	$EntryType = $1;
[4792]	197	my $EntryID = (defined $2) ? $2 : "default";
[2484]	198	print "** $EntryType - \"$EntryID\" \n"
	199	if ($verbosity >= 4);
[4792]	200
[1676]	201	next;
	202	}
[2484]	203	if ($line =~ /\@/) {
[2901]	204	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
[2484]	205	if ($verbosity >= 2);
[2901]	206	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
[2484]	207	if ($verbosity >= 2);
	208	}
[1676]	209
	210	# otherwise, parse the metadata out of this line
[9582]	211	next unless ($line =~ /^\s(\w+)\s=\s(.)/);
[2901]	212	$entryname = lc($1);
[1676]	213	$value = $2;
[3112]	214	$value =~ s/,?\s*$//; # remove trailing comma and space
	215	if ($value =~ /^"/ && $value =~ /"$/) {
	216	# remove surrounding " marks
	217	$value =~ s/^"//; $value =~ s/"$//;
[13416]	218	} elsif ($value =~ /^\{/ && $value =~ /\}$/) {
	219	# remove surrounding {} marks
	220	$value =~ s/^\{//; $value =~ s/\}$//;
[3112]	221	}
[13413]	222	# special case for year - we only want a 4 digit year, not
	223	# "circa 2004" etc
	224	if ($entryname=~ /^year$/i) {
	225	if ($value=~ /(\d{4})/) {
	226	$value=$1;
	227	$metadata{$entryname} .= "$value";
	228	}
	229	}
	230	else {
	231
	232	$value = &process_latex($value);
	233	# Add this line of metadata
	234	$metadata{$entryname} .= "$value";
	235	}
[1676]	236	}
	237
[1677]	238	# Add the Entry type as metadata
[2901]	239	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
[8121]	240
	241	#Add the fileformat as metadata
	242	$doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
[1676]	243
[8121]	244
[1676]	245	# Add the various field as metadata
[2901]	246	foreach my $entryname (keys %metadata) {
	247	next unless (defined $field{$entryname});
	248	next unless (defined $metadata{$entryname});
[1676]	249
[2901]	250	$name = $field{$entryname};
	251	$value = $metadata{$entryname};
[1676]	252
[3142]	253	if ($name =~ /^Month/) {
	254	$value=expand_month($value);
	255	}
[1676]	256
	257	# Several special operatons on metadata follow
	258
	259	# Add individual keywords.
	260	# The full set of keywords will be added, in due course, as "Keywords".
	261	# However, we also want to add them as individual "Keyword" metadata elements.
[2901]	262	if ($entryname eq "keywords") {
[1676]	263	my @keywordlist = split(/,/, $value);
	264	foreach my $k (@keywordlist) {
[2901]	265	$k = lc($k);
[1676]	266	$k =~ s/\s*$//;
	267	$k =~ s/^\s*//;
	268	if ($k =~ /\w/) {
[2901]	269	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
[1676]	270	}
	271	}
	272	}
	273
	274	# Add individual authors
	275	# The author metadata will be stored as one "Creator" entry, but we
	276	# also want to split it into several individual "Author" fields in
[3112]	277	# "Lastname, Firstnames" format so we can browse it.
[2901]	278	if ($entryname eq "author") { #added also comparison with editor
	279
[3249]	280	# take care of "et al."...
[9582]	281	my $etal='';
	282	if ($value =~ s/\s+(and\s+others\|et\.?\s+al\.?)\s*$//i) {
	283	$etal=' <em>et. al.</em>';
	284	}
[2901]	285	# und here for german language...
	286	# don't use brackets in pattern, else the matched bit becomes
	287	# an element in the list!
[3249]	288	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
	289	my @formattedlist = ();
[1676]	290	foreach $a (@authorlist) {
	291	$a =~ s/\s*$//;
	292	$a =~ s/^\s*//;
	293	# Reformat and add author name
[2901]	294	next if $a=~ /^\s*$/;
[3249]	295
	296	# names are "First von Last", "von Last, First"
	297	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
	298	my $first="";
	299	my $vonlast="";
	300	my $jr="";
[2901]	301
[3249]	302	if ($a =~ /,/) {
	303	my @parts=split(/,\s*/, $a);
	304	$first = pop @parts;
	305	if (scalar(@parts) == 2) {
	306	$jr = pop @parts;
	307	}
	308	$vonlast=shift @parts;
	309	if (scalar(@parts) > 0) {
[15872]	310	print $outhandle "BibTexPlugin: couldn't parse name $a\n";
[3249]	311	# but we continue anyway...
	312	}
	313	} else { # First von Last
	314	my @words = split(/ /, $a);
	315	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
	316	$first .= " " . shift (@words);
	317	}
	318	$first =~ s/^\s//;
	319	$vonlast = join (' ', @words); # whatever's left...
	320	}
	321	my $von="";
	322	my $last="";
	323	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
	324	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
	325	$von = $1;
	326	if (!defined ($von)) {
	327	# some non-English names do start with lowercase
	328	# eg "Marie desJardins". Also we can get typos...
[15872]	329	print $outhandle "BibTexPlugin: couldn't parse surname $vonlast\n";
[3249]	330	$von="";
	331	if ($vonlast =~ /^[a-z]+$/) {
	332	# if it's all lowercase, uppercase 1st.
	333	$vonlast =~ s/^(.)/\u$1/;
[2901]	334
[3249]	335	}
	336	}
	337	$von =~ s/\s*$//;
	338	$last=$vonlast;
	339	} else {
	340	$last=$vonlast;
	341	}
	342	my $wholename="$first $von $last $jr";
[9582]	343	$wholename =~ s/\s+/ /g; # squeeze multiple spaces
	344	$wholename =~ s/ $//;
[3249]	345	push (@formattedlist, $wholename);
	346	my $fullname = "$last";
	347	$fullname .= " $jr" if ($jr);
	348	$fullname .= ", $first";
	349	$fullname .= " $von" if ($von);
	350
[1676]	351	# Add each name to set of Authors
[3249]	352	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
[1676]	353	}
[3249]	354
	355	# Only want at most one "and" in the Creator field
	356	if (scalar(@formattedlist) > 2) {
	357	my $lastauthor=pop @formattedlist;
	358	$value=join(', ', @formattedlist);
	359	$value.=" and $lastauthor";
	360	} else { # 1 or 2 authors...
	361	$value=join(" and ",@formattedlist);
	362	}
	363	$value.=$etal; # if there was "et al."
[1676]	364	}
	365
	366	# Books and Journals are additionally marked for display purposes
[2901]	367	if ($entryname eq "booktitle") {
	368	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
	369	} elsif ($entryname eq "journal") {
	370	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
[1676]	371	}
	372
[3249]	373	# Add the various fields as metadata
	374	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
[13413]	375
[1676]	376	}
	377
[9582]	378	# for books and journals...
	379	if (!exists $metadata{'title'}) {
	380	my $name=$field{'title'}; # get Greenstone metadata name
	381	my $value;
	382	if (exists $metadata{'booktitle'}) {
	383	$value=$metadata{'booktitle'};
	384	} elsif (exists $metadata{'journal'}) {
	385	$value=$metadata{'journal'};
	386	}
	387	if ($value) {
	388	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
	389	}
	390	}
	391
[3249]	392	# Add Date (yyyymmdd) metadata
	393	if (defined ($metadata{'year'}) ) {
	394	my $date=$metadata{'year'};
	395	chomp $date;
	396	my $month=$metadata{'month'};
	397	if (defined($month)) {
	398	# month is currently 3 letter code or a range...
	399	$month = expand_month($month);
	400	# take the first month found... might not find one!
	401	$month =~ m/_textmonth(\d\d)_/;
	402	$month = $1;
	403	}
	404	if (!defined($month)) {
	405	$month="00";
	406	}
	407	$date .= "${month}00";
	408	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
	409	}
	410
	411	# # Add the text in BibTex format (all fields)
[1676]	412	if ($text =~ /\w/) {
[3249]	413
	414	$text =~ s@&@&@g;
	415	$text =~ s@<@<@g;
	416	$text =~ s@>@>@g;
	417	$text =~ s@\n@<br/>\n@g;
	418	$text =~ s@\\@\\\\@g;
	419
[2901]	420	$doc_obj->add_utf8_text ($cursection, $text);
[1676]	421	}
	422
[1677]	423	return 1;
[1676]	424	}
	425
[1677]	426
[2901]	427
	428
	429	# convert email addresses and URLs into links
	430	sub convert_urls_into_links{
	431	my ($text) = @_;
	432
	433	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[3540]	434	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
[2901]	435
	436	return $text;
	437	}
	438
	439	# Clean up whitespace and convert \n charaters to <BR> or <P>
	440	sub clean_up_whitespaces{
	441	my ($text) = @_;
	442
	443	$text =~ s/%%%%%/<BR> <BR>/g;
	444	$text =~ s/ +/ /g;
	445	$text =~ s/\s*$//;
	446	$text =~ s/^\s*//;
	447	$text =~ s/\n/\n<BR>/g;
	448	$text =~ s/<BR>\s*<BR>/<P>/g;
	449
	450	return $text;
	451	}
	452
	453
	454	sub convert_problem_characters_without_ampersand{
	455	my ($text) = @_;
	456	$text =~ s/</</g;
	457	$text =~ s/>/>/g;
	458
	459	$text =~ s/\'\'/\"/g; #Latex -specific conversion
	460	$text =~ s/\`\`/\"/g; #Latex -specific conversion
	461
	462	$text =~ s/\"/"/g;
	463	$text =~ s/\'/’/g;
	464	$text =~ s/\`/‘/g;
	465
[3249]	466	# $text =~ s/\+/ /g;
	467	# $text =~ s/\(/ /g;
	468	# $text =~ s/\)/ /g;
	469
[2901]	470	$text =~ s/\\/\\\\/g;
	471
[3249]	472	# $text =~ s/\./\\\./g;
[2901]	473
	474	return $text;
	475	}
	476
[1676]	477	# Convert a text string into HTML.
	478
[1677]	479	# The HTML is going to be inserted into a GML file, so we have to be
[2901]	480	# careful not to use symbols like ">", which occurs frequently in email
[1677]	481	# messages (and use &gt instead.
	482
	483	# This function also turns URLs and email addresses into links, and
	484	# replaces carriage returns with <BR> tags (and multiple carriage returns
	485	# with <P> tags).
	486
[1676]	487	sub text_into_html {
	488	my ($text) = @_;
	489
[2901]	490	# Convert problem characters into HTML symbols
[1676]	491	$text =~ s/&/&/g;
	492
[2901]	493	$text = &convert_problem_characters_without_ampersand( $text );
	494
[1676]	495	# convert email addresses and URLs into links
[2901]	496	$text = &convert_urls_into_links( $text );
[1676]	497
[2901]	498	$text = &clean_up_whitespaces( $text );
[1676]	499
	500	return $text;
	501	}
	502
[2901]	503
[3142]	504	sub expand_month {
	505	my $text=shift;
[2901]	506
[3142]	507	# bibtex style files expand abbreviations for months.
	508	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
	509	$text =~ s/jan/_textmonth01_/g;
	510	$text =~ s/feb/_textmonth02_/g;
	511	$text =~ s/mar/_textmonth03_/g;
	512	$text =~ s/apr/_textmonth04_/g;
	513	$text =~ s/may/_textmonth05_/g;
	514	$text =~ s/jun/_textmonth06_/g;
	515	$text =~ s/jul/_textmonth07_/g;
	516	$text =~ s/aug/_textmonth08_/g;
	517	$text =~ s/sep/_textmonth09_/g;
	518	$text =~ s/oct/_textmonth10_/g;
	519	$text =~ s/nov/_textmonth11_/g;
	520	$text =~ s/dec/_textmonth12_/g;
[2901]	521
[3142]	522	return $text;
	523	}
	524
	525
[2901]	526	# Convert accented characters, remove { }, interprete some commands....
	527	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
	528	sub process_latex {
	529	my ($text) = @_;
	530
	531	# note - this is really ugly, but it works. There may be a prettier way
	532	# of mapping latex accented chars to utf8, but we just brute force it here.
	533	# Also, this isn't complete - not every single possible accented letter
	534	# is in here yet, but most of the common ones are.
	535
	536	my %utf8_chars =
	537	(
	538	# acutes
	539	'\'a' => chr(0xc3).chr(0xa1),
	540	'\'c' => chr(0xc4).chr(0x87),
	541	'\'e' => chr(0xc3).chr(0xa9),
	542	'\'i' => chr(0xc3).chr(0xad),
	543	'\'l' => chr(0xc3).chr(0xba),
	544	'\'n' => chr(0xc3).chr(0x84),
	545	'\'o' => chr(0xc3).chr(0xb3),
	546	'\'r' => chr(0xc5).chr(0x95),
	547	'\'s' => chr(0xc5).chr(0x9b),
	548	'\'u' => chr(0xc3).chr(0xba),
	549	'\'y' => chr(0xc3).chr(0xbd),
	550	'\'z' => chr(0xc5).chr(0xba),
	551	# graves
	552	'`a' => chr(0xc3).chr(0xa0),
	553	'`A' => chr(0xc3).chr(0x80),
	554	'`e' => chr(0xc3).chr(0xa8),
	555	'`E' => chr(0xc3).chr(0x88),
	556	'`i' => chr(0xc3).chr(0xac),
	557	'`I' => chr(0xc3).chr(0x8c),
	558	'`o' => chr(0xc3).chr(0xb2),
	559	'`O' => chr(0xc3).chr(0x92),
	560	'`u' => chr(0xc3).chr(0xb9),
	561	'`U' => chr(0xc3).chr(0x99),
	562	# circumflex
	563	'^a' => chr(0xc3).chr(0xa2),
	564	'^A' => chr(0xc3).chr(0x82),
	565	'^c' => chr(0xc4).chr(0x89),
	566	'^C' => chr(0xc4).chr(0x88),
	567	'^e' => chr(0xc3).chr(0xaa),
	568	'^E' => chr(0xc3).chr(0x8a),
	569	'^g' => chr(0xc4).chr(0x9d),
	570	'^G' => chr(0xc4).chr(0x9c),
	571	'^h' => chr(0xc4).chr(0xa5),
	572	'^H' => chr(0xc4).chr(0xa4),
	573	'^i' => chr(0xc3).chr(0xae),
	574	'^I' => chr(0xc3).chr(0x8e),
	575	'^j' => chr(0xc4).chr(0xb5),
	576	'^J' => chr(0xc4).chr(0xb4),
	577	'^o' => chr(0xc3).chr(0xb4),
	578	'^O' => chr(0xc3).chr(0x94),
	579	'^s' => chr(0xc5).chr(0x9d),
	580	'^S' => chr(0xc5).chr(0x9c),
	581	'^u' => chr(0xc3).chr(0xa2),
	582	'^U' => chr(0xc3).chr(0xbb),
	583	'^w' => chr(0xc5).chr(0xb5),
	584	'^W' => chr(0xc5).chr(0xb4),
	585	'^y' => chr(0xc5).chr(0xb7),
	586	'^Y' => chr(0xc5).chr(0xb6),
	587
	588	# diaeresis
	589	'"a' => chr(0xc3).chr(0xa4),
	590	'"A' => chr(0xc3).chr(0x84),
	591	'"e' => chr(0xc3).chr(0xab),
	592	'"E' => chr(0xc3).chr(0x8b),
	593	'"\\\\i' => chr(0xc3).chr(0xaf),
	594	'"\\\\I' => chr(0xc3).chr(0x8f),
	595	'"o' => chr(0xc3).chr(0xb6),
	596	'"O' => chr(0xc3).chr(0x96),
	597	'"u' => chr(0xc3).chr(0xbc),
	598	'"U' => chr(0xc3).chr(0x9c),
	599	'"y' => chr(0xc3).chr(0xbf),
	600	'"Y' => chr(0xc3).chr(0xb8),
	601	# tilde
[3142]	602	'~A' => chr(0xc3).chr(0x83),
	603	'~N' => chr(0xc3).chr(0x91),
	604	'~O' => chr(0xc3).chr(0x95),
	605	'~a' => chr(0xc3).chr(0xa3),
	606	'~n' => chr(0xc3).chr(0xb1),
	607	'~o' => chr(0xc3).chr(0xb5),
[2901]	608	# caron - handled specially
	609	# double acute
	610	# ring
	611	# dot
[3249]	612	'.c' => chr(0xc4).chr(0x8b),
	613	'.C' => chr(0xc4).chr(0x8a),
	614	'.e' => chr(0xc4).chr(0x97),
	615	'.E' => chr(0xc4).chr(0x96),
	616	'.g' => chr(0xc4).chr(0xa1),
	617	'.G' => chr(0xc4).chr(0xa0),
	618	'.I' => chr(0xc4).chr(0xb0),
	619	'.z' => chr(0xc5).chr(0xbc),
	620	'.Z' => chr(0xc5).chr(0xbb),
[2901]	621	# macron
	622	'=a' => chr(0xc4).chr(0x81),
	623	'=A' => chr(0xc4).chr(0x80),
	624	'=e' => chr(0xc4).chr(0x93),
	625	'=E' => chr(0xc4).chr(0x92),
	626	'=i' => chr(0xc4).chr(0xab),
	627	'=I' => chr(0xc4).chr(0xaa),
	628	'=o' => chr(0xc4).chr(0x8d),
	629	'=O' => chr(0xc4).chr(0x8c),
	630	'=u' => chr(0xc4).chr(0xab),
	631	'=U' => chr(0xc4).chr(0xaa),
	632
	633	# stroke - handled specially - see below
	634
	635	# cedilla - handled specially
	636	);
	637
	638	# these are one letter latex commands - we make sure they're not a longer
	639	# command name. eg {\d} is d+stroke, so careful of \d
	640	my %special_utf8_chars =
	641	(
[3156]	642	# breve
	643	'u g' => chr(0xc4).chr(0x9f),
	644	'u G' => chr(0xc4).chr(0x9e),
	645	'u i' => chr(0xc4).chr(0xad),
	646	'u I' => chr(0xc4).chr(0xac),
	647	'u o' => chr(0xc5).chr(0x8f),
	648	'u O' => chr(0xc5).chr(0x8e),
	649	'u u' => chr(0xc5).chr(0xad),
	650	'u U' => chr(0xc5).chr(0xac),
[2901]	651	# caron
[3156]	652	'v c' => chr(0xc4).chr(0x8d),
	653	'v C' => chr(0xc4).chr(0x8c),
[2901]	654	'v n' => chr(0xc5).chr(0x88),
	655	'v N' => chr(0xc5).chr(0x87),
	656	'v s' => chr(0xc5).chr(0xa1),
	657	'v S' => chr(0xc5).chr(0xa5),
[3156]	658	'v z' => chr(0xc5).chr(0xbe),
	659	'v Z' => chr(0xc5).chr(0xbd),
[2901]	660	# cedilla
	661	'c c' => chr(0xc3).chr(0xa7),
	662	'c C' => chr(0xc3).chr(0x87),
	663	'c g' => chr(0xc4).chr(0xa3),
	664	'c G' => chr(0xc4).chr(0xa2),
	665	'c k' => chr(0xc4).chr(0xb7),
	666	'c K' => chr(0xc4).chr(0xb6),
	667	'c l' => chr(0xc4).chr(0xbc),
	668	'c L' => chr(0xc4).chr(0xbb),
	669	'c n' => chr(0xc5).chr(0x86),
	670	'c N' => chr(0xc5).chr(0x85),
	671	'c r' => chr(0xc5).chr(0x97),
	672	'c R' => chr(0xc5).chr(0x96),
	673	'c s' => chr(0xc5).chr(0x9f),
	674	'c S' => chr(0xc5).chr(0x9e),
	675	'c t' => chr(0xc5).chr(0xa3),
	676	'c T' => chr(0xc5).chr(0xa2),
	677	# double acute / Hungarian accent
	678	'H O' => chr(0xc5).chr(0x90),
	679	'H o' => chr(0xc5).chr(0x91),
	680	'H U' => chr(0xc5).chr(0xb0),
	681	'H u' => chr(0xc5).chr(0xb1),
	682
	683	# stroke
	684	'd' => chr(0xc4).chr(0x91),
	685	'D' => chr(0xc4).chr(0x90),
	686	'h' => chr(0xc4).chr(0xa7),
	687	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
[3112]	688	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
[2901]	689	'l' => chr(0xc5).chr(0x82),
	690	'L' => chr(0xc5).chr(0x81),
	691	'o' => chr(0xc3).chr(0xb8),
	692	'O' => chr(0xc3).chr(0x98),
	693	't' => chr(0xc5).chr(0xa7),
	694	'T' => chr(0xc5).chr(0xa6),
	695	# german ss/szlig/sharp s
	696	'ss' => chr(0xc3).chr(0x9f),
	697	);
	698
	699	# convert latex-style accented characters.
[3249]	700
[2901]	701	# remove space (if any) between \ and letter to accent (eg {\' a})
	702
[13413]	703	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; #`
	704
[2901]	705	# remove {} around a single character (eg \'{e})
[13413]	706	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g; #`
[2901]	707
[3249]	708	# \, is another way of doing cedilla \c
	709	$text =~ s@\\,(.)@\\c $1@g;
	710
[2901]	711	# remove {} around a single character for special 1 letter commands -
	712	# need to insert a space. Eg \v{s} -> {\v s}
[3156]	713	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
[3249]	714
[2901]	715	# only do if the text contains a '\' character.
	716	if ($text =~ m\|\\\|) {
[3249]	717	# "normal" accents - ie non-alpha latex tag
[13413]	718	while ($text =~ m@\\([`'="^~\.])([\w])@) { #`
[3249]	719	my $tex="$1$2"; my $char="$2";
	720	my $replacement=$utf8_chars{$tex};
	721	if (!defined($replacement)) {
[15872]	722	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]	723	$replacement=$char;
	724	}
[5454]	725	$text =~ s/\\\Q$tex/$replacement/g;
[3249]	726	}
[2901]	727
[3249]	728	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
	729	# only do the change if immediately followed by a space, }, {, or \
	730	# one letter accents ( + ss)
	731	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
	732	my $tex=$1;
	733	my $replacement=$special_utf8_chars{$tex};
	734	if (!defined($replacement)) {
[15872]	735	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]	736	$replacement=$tex;
	737	}
	738	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
	739
	740	}
	741
	742	# one letter latex accent commands that affect following letter
	743	while ($text =~ m@\\([uvcH]) ([\w])@) {
	744	my $tex="$1 $2"; my $char="$2";
	745	my $replacement=$special_utf8_chars{$tex};
	746	if (!defined($replacement)) {
[15872]	747	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]	748	$replacement=$char;
	749	}
	750	$text =~ s/\\$tex/$replacement/g;
	751	}
[2901]	752	}
[3249]	753
	754	# escape html-sensitive characters
	755	$text =~ s@&@&@g;
	756	$text =~ s@<@<@g;
	757	$text =~ s@>@>@g;
	758	$text =~ s/''/"/g; # Latex-specific
	759	$text =~ s/``/"/g; # Latex-specific
	760	# greenstone-specific
	761	$text =~ s@\[@&\#91;@g;
	762	$text =~ s@\]@&\#93;@g;
	763
[3112]	764	# remove latex commands
[3249]	765
	766	# explicitly recognised commands
	767	$text =~ s@\\ldots@…@g;
	768
	769	# maths mode
	770	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
	771
	772	# remove all other commands with optional arguments...
[3156]	773	$text =~ s@\\\w+(\[.?\])?\s@@g;
	774	# $text =~ s@\\noopsort{[^}]+\}@@g;
	775	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
[3142]	776
[2901]	777	# remove latex groupings { } (but not \{ or \} )
[3249]	778	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
[3142]	779	$text =~ s/^\{//; # remove { if first char
	780
	781	# latex characters
	782	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
[3249]	783	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
[3142]	784	# optional break "\-"
[3249]	785	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
[3142]	786	# the non-macro bits have quotes around them - we just remove quotes
[3249]	787	$text =~ s/([^&])[\"\#]/$1/g;
[3142]	788	}
[3249]	789	# dashes. Convert (m\|n)-dash into single dash for html.
	790	$text =~ s@\-\-+@\-@g;
	791
[2901]	792	# quoted { } chars
	793	$text =~ s@\\{@{@g;
	794	$text =~ s@\\}@}@g;
	795
[3249]	796	# finally to protect against macro language...
	797	$text =~ s@\\@\\\\@g;
	798
[2901]	799	return $text;
	800	}
	801
	802
[3249]	803	sub process_latex_math {
	804	my $text = shift;
	805
	806	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
	807	$text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
	808	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
	809	$text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
	810	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
	811
	812	# put all other command names in italics
	813	$text =~ s@\\([\w]+)@<i>$1</i>@g;
	814
	815	return $text;
	816	}
	817
[17026]	818	sub add_OID {
[2484]	819	my $self = shift (@_);
	820	my ($doc_obj, $id, $segment_number) = @_;
	821
	822	if ( $self->{'key'} eq "default") {
[17026]	823	$self->SUPER::add_OID(@_);
	824	# $doc_obj->set_OID("$id\_$segment_number");
[2484]	825	} else {
	826	$doc_obj->set_OID($self->{'key'});
	827	}
	828	}
[1676]	829
[2484]	830	1;
[4792]	831
	832

Note: See TracBrowser for help on using the repository browser.

Download in other formats: