Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

BibTexPlugin.pm

Last change on this file was 38802, checked in by kjdon, 8 weeks ago
when investigating how OIDs are set I noticed we have this add_OID code which is never called - the new name should be add_segment_OID. Secondly, back in 2017 setting the self->{'key'} was removed - accidentally? I have put it back. This is so that is a citekey is included in hte bibtex entry, we can use that for the doc ID
Property svn:keywords set to `Author Date Id Revision`
File size: 23.7 KB

Line
1	###########################################################################
2	#
3	# BibTexPlugin.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlugin reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlugin creates a document object for every reference a the file.
35	# It is a subclass of SplitTextFile, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42
43
44	package BibTexPlugin;
45
46	use SplitTextFile;
47	use MetadataRead;
48	use strict;
49	no strict 'refs'; # allow filehandles to be variables and viceversa
50
51	# BibTexPlugin is a sub-class of SplitTextFile.
52	sub BEGIN {
53	@BibTexPlugin::ISA = ('MetadataRead', 'SplitTextFile');
54	}
55
56	my $arguments =
57	[ { 'name' => "process_exp",
58	'desc' => "{BaseImporter.process_exp}",
59	'type' => "regexp",
60	'reqd' => "no",
61	'deft' => &get_default_process_exp() },
62	{ 'name' => "split_exp",
63	'desc' => "{SplitTextFile.split_exp}",
64	'type' => "regexp",
65	'deft' => &get_default_split_exp(),
66	'reqd' => "no" }
67	];
68
69	my $options = { 'name' => "BibTexPlugin",
70	'desc' => "{BibTexPlugin.desc}",
71	'abstract' => "no",
72	'inherits' => "yes",
73	'explodes' => "yes",
74	'args' => $arguments };
75
76	# This plugin processes files with the suffix ".bib"
77	sub get_default_process_exp {
78	return q^(?i)\.bib$^;
79	}
80
81	# This plugin splits the input text at blank lines
82	sub get_default_split_exp {
83	return q^\n+(?=@)^;
84	}
85
86	sub new {
87	my ($class) = shift (@_);
88	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
89	push(@$pluginlist, $class);
90
91	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92	push(@{$hashArgOptLists->{"OptList"}},$options);
93
94	my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
95
96	return bless $self, $class;
97	}
98
99
100
101	# The process function reads a single bibliographic record and stores
102	# it as a new document.
103
104	sub process {
105	my $self = shift (@_);
106	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
107	my $outhandle = $self->{'outhandle'};
108
109	my $cursection = $doc_obj->get_top_section();
110	$self->{'key'} = "default";
111
112	# Check that we're dealing with a valid BibTex record
113	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
114
115	# Ignore things we can't use
116	return 0 if ($$textref =~ /^\@String/);
117
118	# This hash translates BibTex field names into metadata names. The
119	# BibTex names are taken from the "Local Guide to Latex" Graeme
120	# McKinstry. Metadata names are consistent with ReferPlug.
121
122	# The author metadata will be stored as one "Creator" entry, but will
123	# also be split up into several individual "Author" fields.
124
125	my %field = (
126	'address', 'PublisherAddress',
127	'author', 'Creator',
128
129	'booktitle', 'Booktitle',
130	'chapter', 'Chapter',
131	'edition', 'Edition',
132	'editor', 'Editor',
133	'institution', 'Publisher',
134	'journal', 'Journal',
135	'month', 'Month',
136	'number', 'Number',
137	'pages', 'Pages',
138	'publisher', 'Publisher',
139	'school', 'Publisher',
140	'title', 'Title',
141	'volume', 'Volume',
142	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
143
144	'keywords', 'Keywords',
145	'abstract', 'Abstract',
146	'copyright', 'Copyright',
147	'note', 'Note',
148	'url', 'URL',
149	);
150
151	# Metadata fields
152	my %metadata;
153	my ($EntryType, $Creator, $Keywords, $text);
154
155	my $verbosity = $self->{'verbosity'};
156	$verbosity = 0 unless $verbosity;
157
158	# Make sure the text has exactly one entry per line -
159	# append line to previous if it doesn't start with " <key> = "
160
161	my @input_lines=split(/\r?\n/, $$textref);
162	my @all_lines;
163	my $entry_line=shift @input_lines;
164	foreach my $input_line (@input_lines) {
165	if ($input_line =~ m/^\s\w+\s=\s*/) {
166	# this is a new key
167	push(@all_lines, $entry_line);
168	$entry_line=$input_line;
169	} else {
170	# this is a continuation of previous line
171	$entry_line .= " " . $input_line;
172	}
173
174	}
175	# add final line, removing trailing '}'
176	$entry_line =~ s/\}\s*$//;
177	push(@all_lines, $entry_line);
178	push(@all_lines, "}");
179
180	# Read and process each line in the bib file.
181	my ($entryname, $name, $value, $line);
182	foreach $line (@all_lines) {
183
184	# Add each line. Most lines consist of a field identifer and
185	# then data, and we simply store them, though we treat some
186	# of the fields a bit differently.
187
188	$line =~ s/\s+/ /g;
189	$text .= "$line\n";
190
191
192	print STDERR "Processing line = $line \n" if $verbosity>=4;
193
194	# The first line is special, it contains the reference type and optional OID
195	# oid (citekey) can contain alphanumeric, _, - and :
196	if ($line =~ /\@(\w+)\W\{\W([\\.\w:-])\W*$/) {
197	$EntryType = $1; # eg @workingpaper
198	my $EntryID = $2; # eg {Witen2020,
199	print "** $EntryType - \"$EntryID\" \n" if ($verbosity >= 4);
200	if (defined $EntryID && $EntryID =~ /\w+/) {
201	$self->{'key'} = $EntryID;
202	}
203	next;
204	}
205	if ($line =~ /\@/) {
206	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
207	if ($verbosity >= 2);
208	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
209	if ($verbosity >= 2);
210	}
211
212	# otherwise, parse the metadata out of this line
213	next unless ($line =~ /^\s(\w+)\s=\s(.)/);
214	$entryname = lc($1);
215	$value = $2;
216	$value =~ s/,?\s*$//; # remove trailing comma and space
217	if ($value =~ /^"/ && $value =~ /"$/) {
218	# remove surrounding " marks
219	$value =~ s/^"//; $value =~ s/"$//;
220	} elsif ($value =~ /^\{/ && $value =~ /\}$/) {
221	# remove surrounding {} marks
222	$value =~ s/^\{//; $value =~ s/\}$//;
223	}
224	# special case for year - we only want a 4 digit year, not
225	# "circa 2004" etc
226	if ($entryname=~ /^year$/i) {
227	if ($value=~ /(\d{4})/) {
228	$value=$1;
229	$metadata{$entryname} .= "$value";
230	}
231	}
232	else {
233
234	$value = &process_latex($value);
235	# Add this line of metadata
236	$metadata{$entryname} .= "$value";
237	}
238	}
239
240	# Add the Entry type as metadata
241	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
242
243	#Add the fileformat as metadata
244	$doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
245
246
247	# Add the various field as metadata
248	foreach my $entryname (keys %metadata) {
249	next unless (defined $field{$entryname});
250	next unless (defined $metadata{$entryname});
251
252	$name = $field{$entryname};
253	$value = $metadata{$entryname};
254
255	if ($name =~ /^Month/) {
256	$value=expand_month($value);
257	}
258
259	# Several special operatons on metadata follow
260
261	# Add individual keywords.
262	# The full set of keywords will be added, in due course, as "Keywords".
263	# However, we also want to add them as individual "Keyword" metadata elements.
264	if ($entryname eq "keywords") {
265	my @keywordlist = split(/,/, $value);
266	foreach my $k (@keywordlist) {
267	$k = lc($k);
268	$k =~ s/\s*$//;
269	$k =~ s/^\s*//;
270	if ($k =~ /\w/) {
271	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
272	}
273	}
274	}
275
276	# Add individual authors
277	# The author metadata will be stored as one "Creator" entry, but we
278	# also want to split it into several individual "Author" fields in
279	# "Lastname, Firstnames" format so we can browse it.
280	if ($entryname eq "author") { #added also comparison with editor
281
282	# take care of "et al."...
283	my $etal='';
284	if ($value =~ s/\s+(and\s+others\|et\.?\s+al\.?)\s*$//i) {
285	$etal=' <em>et. al.</em>';
286	}
287	# und here for german language...
288	# don't use brackets in pattern, else the matched bit becomes
289	# an element in the list!
290	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
291	my @formattedlist = ();
292	foreach $a (@authorlist) {
293	$a =~ s/\s*$//;
294	$a =~ s/^\s*//;
295	# Reformat and add author name
296	next if $a=~ /^\s*$/;
297
298	# names are "First von Last", "von Last, First"
299	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
300	my $first="";
301	my $vonlast="";
302	my $jr="";
303
304	if ($a =~ /,/) {
305	my @parts=split(/,\s*/, $a);
306	$first = pop @parts;
307	if (scalar(@parts) == 2) {
308	$jr = pop @parts;
309	}
310	$vonlast=shift @parts;
311	if (scalar(@parts) > 0) {
312	print $outhandle "BibTexPlugin: couldn't parse name $a\n";
313	# but we continue anyway...
314	}
315	} else { # First von Last
316	my @words = split(/ /, $a);
317	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2,}/) {
318	$first .= " " . shift (@words);
319	}
320	$first =~ s/^\s//;
321	$vonlast = join (' ', @words); # whatever's left...
322	}
323	my $von="";
324	my $last="";
325	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
326	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
327	$von = $1;
328	if (!defined ($von)) {
329	# some non-English names do start with lowercase
330	# eg "Marie desJardins". Also we can get typos...
331	print $outhandle "BibTexPlugin: couldn't parse surname $vonlast\n";
332	$von="";
333	if ($vonlast =~ /^[a-z]+$/) {
334	# if it's all lowercase, uppercase 1st.
335	$vonlast =~ s/^(.)/\u$1/;
336
337	}
338	}
339	$von =~ s/\s*$//;
340	$last=$vonlast;
341	} else {
342	$last=$vonlast;
343	}
344	my $wholename="$first $von $last $jr";
345	$wholename =~ s/\s+/ /g; # squeeze multiple spaces
346	$wholename =~ s/ $//;
347	push (@formattedlist, $wholename);
348	my $fullname = "$last";
349	$fullname .= " $jr" if ($jr);
350	$fullname .= ", $first";
351	$fullname .= " $von" if ($von);
352
353	# Add each name to set of Authors
354	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
355	}
356
357	# Only want at most one "and" in the Creator field
358	if (scalar(@formattedlist) > 2) {
359	my $lastauthor=pop @formattedlist;
360	$value=join(', ', @formattedlist);
361	$value.=" and $lastauthor";
362	} else { # 1 or 2 authors...
363	$value=join(" and ",@formattedlist);
364	}
365	$value.=$etal; # if there was "et al."
366	}
367
368	# Books and Journals are additionally marked for display purposes
369	if ($entryname eq "booktitle") {
370	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
371	} elsif ($entryname eq "journal") {
372	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
373	}
374
375	# Add the various fields as metadata
376	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
377
378	}
379
380	# for books and journals...
381	if (!exists $metadata{'title'}) {
382	my $name=$field{'title'}; # get Greenstone metadata name
383	my $value;
384	if (exists $metadata{'booktitle'}) {
385	$value=$metadata{'booktitle'};
386	} elsif (exists $metadata{'journal'}) {
387	$value=$metadata{'journal'};
388	}
389	if ($value) {
390	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
391	}
392	}
393
394	# Add Date (yyyymmdd) metadata
395	if (defined ($metadata{'year'}) ) {
396	my $date=$metadata{'year'};
397	chomp $date;
398	my $month=$metadata{'month'};
399	if (defined($month)) {
400	# month is currently 3 letter code or a range...
401	$month = expand_month($month);
402	# take the first month found... might not find one!
403	$month =~ m/_textmonth(\d\d)_/;
404	$month = $1;
405	}
406	if (!defined($month)) {
407	$month="00";
408	}
409	$date .= "${month}00";
410	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
411	}
412
413	# # Add the text in BibTex format (all fields)
414	if ($text =~ /\w/) {
415
416	$text =~ s@&@&@g;
417	$text =~ s@<@<@g;
418	$text =~ s@>@>@g;
419	$text =~ s@\n@<br/>\n@g;
420	$text =~ s@\\@\\\\@g;
421
422	$doc_obj->add_utf8_text ($cursection, $text);
423	}
424
425	return 1;
426	}
427
428
429
430
431	# convert email addresses and URLs into links
432	sub convert_urls_into_links{
433	my ($text) = @_;
434
435	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
436	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
437
438	return $text;
439	}
440
441	# Clean up whitespace and convert \n charaters to <BR> or <P>
442	sub clean_up_whitespaces{
443	my ($text) = @_;
444
445	$text =~ s/%%%%%/<BR> <BR>/g;
446	$text =~ s/ +/ /g;
447	$text =~ s/\s*$//;
448	$text =~ s/^\s*//;
449	$text =~ s/\n/\n<BR>/g;
450	$text =~ s/<BR>\s*<BR>/<P>/g;
451
452	return $text;
453	}
454
455
456	sub convert_problem_characters_without_ampersand{
457	my ($text) = @_;
458	$text =~ s/</</g;
459	$text =~ s/>/>/g;
460
461	$text =~ s/\'\'/\"/g; #Latex -specific conversion
462	$text =~ s/\`\`/\"/g; #Latex -specific conversion
463
464	$text =~ s/\"/"/g;
465	$text =~ s/\'/’/g;
466	$text =~ s/\`/‘/g;
467
468	# $text =~ s/\+/ /g;
469	# $text =~ s/\(/ /g;
470	# $text =~ s/\)/ /g;
471
472	$text =~ s/\\/\\\\/g;
473
474	# $text =~ s/\./\\\./g;
475
476	return $text;
477	}
478
479	# Convert a text string into HTML.
480
481	# The HTML is going to be inserted into a GML file, so we have to be
482	# careful not to use symbols like ">", which occurs frequently in email
483	# messages (and use &gt instead.
484
485	# This function also turns URLs and email addresses into links, and
486	# replaces carriage returns with <BR> tags (and multiple carriage returns
487	# with <P> tags).
488
489	sub text_into_html {
490	my ($text) = @_;
491
492	# Convert problem characters into HTML symbols
493	$text =~ s/&/&/g;
494
495	$text = &convert_problem_characters_without_ampersand( $text );
496
497	# convert email addresses and URLs into links
498	$text = &convert_urls_into_links( $text );
499
500	$text = &clean_up_whitespaces( $text );
501
502	return $text;
503	}
504
505
506	sub expand_month {
507	my $text=shift;
508
509	# bibtex style files expand abbreviations for months.
510	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
511	$text =~ s/jan/_textmonth01_/g;
512	$text =~ s/feb/_textmonth02_/g;
513	$text =~ s/mar/_textmonth03_/g;
514	$text =~ s/apr/_textmonth04_/g;
515	$text =~ s/may/_textmonth05_/g;
516	$text =~ s/jun/_textmonth06_/g;
517	$text =~ s/jul/_textmonth07_/g;
518	$text =~ s/aug/_textmonth08_/g;
519	$text =~ s/sep/_textmonth09_/g;
520	$text =~ s/oct/_textmonth10_/g;
521	$text =~ s/nov/_textmonth11_/g;
522	$text =~ s/dec/_textmonth12_/g;
523
524	return $text;
525	}
526
527
528	# Convert accented characters, remove { }, interprete some commands....
529	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
530	sub process_latex {
531	my ($text) = @_;
532
533	# note - this is really ugly, but it works. There may be a prettier way
534	# of mapping latex accented chars to utf8, but we just brute force it here.
535	# Also, this isn't complete - not every single possible accented letter
536	# is in here yet, but most of the common ones are.
537
538	my %utf8_chars =
539	(
540	# acutes
541	'\'a' => chr(0xc3).chr(0xa1),
542	'\'c' => chr(0xc4).chr(0x87),
543	'\'e' => chr(0xc3).chr(0xa9),
544	'\'i' => chr(0xc3).chr(0xad),
545	'\'l' => chr(0xc3).chr(0xba),
546	'\'n' => chr(0xc3).chr(0x84),
547	'\'o' => chr(0xc3).chr(0xb3),
548	'\'r' => chr(0xc5).chr(0x95),
549	'\'s' => chr(0xc5).chr(0x9b),
550	'\'u' => chr(0xc3).chr(0xba),
551	'\'y' => chr(0xc3).chr(0xbd),
552	'\'z' => chr(0xc5).chr(0xba),
553	# graves
554	'`a' => chr(0xc3).chr(0xa0),
555	'`A' => chr(0xc3).chr(0x80),
556	'`e' => chr(0xc3).chr(0xa8),
557	'`E' => chr(0xc3).chr(0x88),
558	'`i' => chr(0xc3).chr(0xac),
559	'`I' => chr(0xc3).chr(0x8c),
560	'`o' => chr(0xc3).chr(0xb2),
561	'`O' => chr(0xc3).chr(0x92),
562	'`u' => chr(0xc3).chr(0xb9),
563	'`U' => chr(0xc3).chr(0x99),
564	# circumflex
565	'^a' => chr(0xc3).chr(0xa2),
566	'^A' => chr(0xc3).chr(0x82),
567	'^c' => chr(0xc4).chr(0x89),
568	'^C' => chr(0xc4).chr(0x88),
569	'^e' => chr(0xc3).chr(0xaa),
570	'^E' => chr(0xc3).chr(0x8a),
571	'^g' => chr(0xc4).chr(0x9d),
572	'^G' => chr(0xc4).chr(0x9c),
573	'^h' => chr(0xc4).chr(0xa5),
574	'^H' => chr(0xc4).chr(0xa4),
575	'^i' => chr(0xc3).chr(0xae),
576	'^I' => chr(0xc3).chr(0x8e),
577	'^j' => chr(0xc4).chr(0xb5),
578	'^J' => chr(0xc4).chr(0xb4),
579	'^o' => chr(0xc3).chr(0xb4),
580	'^O' => chr(0xc3).chr(0x94),
581	'^s' => chr(0xc5).chr(0x9d),
582	'^S' => chr(0xc5).chr(0x9c),
583	'^u' => chr(0xc3).chr(0xa2),
584	'^U' => chr(0xc3).chr(0xbb),
585	'^w' => chr(0xc5).chr(0xb5),
586	'^W' => chr(0xc5).chr(0xb4),
587	'^y' => chr(0xc5).chr(0xb7),
588	'^Y' => chr(0xc5).chr(0xb6),
589
590	# diaeresis
591	'"a' => chr(0xc3).chr(0xa4),
592	'"A' => chr(0xc3).chr(0x84),
593	'"e' => chr(0xc3).chr(0xab),
594	'"E' => chr(0xc3).chr(0x8b),
595	'"\\\\i' => chr(0xc3).chr(0xaf),
596	'"\\\\I' => chr(0xc3).chr(0x8f),
597	'"o' => chr(0xc3).chr(0xb6),
598	'"O' => chr(0xc3).chr(0x96),
599	'"u' => chr(0xc3).chr(0xbc),
600	'"U' => chr(0xc3).chr(0x9c),
601	'"y' => chr(0xc3).chr(0xbf),
602	'"Y' => chr(0xc3).chr(0xb8),
603	# tilde
604	'~A' => chr(0xc3).chr(0x83),
605	'~N' => chr(0xc3).chr(0x91),
606	'~O' => chr(0xc3).chr(0x95),
607	'~a' => chr(0xc3).chr(0xa3),
608	'~n' => chr(0xc3).chr(0xb1),
609	'~o' => chr(0xc3).chr(0xb5),
610	# caron - handled specially
611	# double acute
612	# ring
613	# dot
614	'.c' => chr(0xc4).chr(0x8b),
615	'.C' => chr(0xc4).chr(0x8a),
616	'.e' => chr(0xc4).chr(0x97),
617	'.E' => chr(0xc4).chr(0x96),
618	'.g' => chr(0xc4).chr(0xa1),
619	'.G' => chr(0xc4).chr(0xa0),
620	'.I' => chr(0xc4).chr(0xb0),
621	'.z' => chr(0xc5).chr(0xbc),
622	'.Z' => chr(0xc5).chr(0xbb),
623	# macron
624	'=a' => chr(0xc4).chr(0x81),
625	'=A' => chr(0xc4).chr(0x80),
626	'=e' => chr(0xc4).chr(0x93),
627	'=E' => chr(0xc4).chr(0x92),
628	'=i' => chr(0xc4).chr(0xab),
629	'=I' => chr(0xc4).chr(0xaa),
630	'=o' => chr(0xc4).chr(0x8d),
631	'=O' => chr(0xc4).chr(0x8c),
632	'=u' => chr(0xc4).chr(0xab),
633	'=U' => chr(0xc4).chr(0xaa),
634
635	# stroke - handled specially - see below
636
637	# cedilla - handled specially
638	);
639
640	# these are one letter latex commands - we make sure they're not a longer
641	# command name. eg {\d} is d+stroke, so careful of \d
642	my %special_utf8_chars =
643	(
644	# breve
645	'u g' => chr(0xc4).chr(0x9f),
646	'u G' => chr(0xc4).chr(0x9e),
647	'u i' => chr(0xc4).chr(0xad),
648	'u I' => chr(0xc4).chr(0xac),
649	'u o' => chr(0xc5).chr(0x8f),
650	'u O' => chr(0xc5).chr(0x8e),
651	'u u' => chr(0xc5).chr(0xad),
652	'u U' => chr(0xc5).chr(0xac),
653	# caron
654	'v c' => chr(0xc4).chr(0x8d),
655	'v C' => chr(0xc4).chr(0x8c),
656	'v n' => chr(0xc5).chr(0x88),
657	'v N' => chr(0xc5).chr(0x87),
658	'v s' => chr(0xc5).chr(0xa1),
659	'v S' => chr(0xc5).chr(0xa5),
660	'v z' => chr(0xc5).chr(0xbe),
661	'v Z' => chr(0xc5).chr(0xbd),
662	# cedilla
663	'c c' => chr(0xc3).chr(0xa7),
664	'c C' => chr(0xc3).chr(0x87),
665	'c g' => chr(0xc4).chr(0xa3),
666	'c G' => chr(0xc4).chr(0xa2),
667	'c k' => chr(0xc4).chr(0xb7),
668	'c K' => chr(0xc4).chr(0xb6),
669	'c l' => chr(0xc4).chr(0xbc),
670	'c L' => chr(0xc4).chr(0xbb),
671	'c n' => chr(0xc5).chr(0x86),
672	'c N' => chr(0xc5).chr(0x85),
673	'c r' => chr(0xc5).chr(0x97),
674	'c R' => chr(0xc5).chr(0x96),
675	'c s' => chr(0xc5).chr(0x9f),
676	'c S' => chr(0xc5).chr(0x9e),
677	'c t' => chr(0xc5).chr(0xa3),
678	'c T' => chr(0xc5).chr(0xa2),
679	# double acute / Hungarian accent
680	'H O' => chr(0xc5).chr(0x90),
681	'H o' => chr(0xc5).chr(0x91),
682	'H U' => chr(0xc5).chr(0xb0),
683	'H u' => chr(0xc5).chr(0xb1),
684
685	# stroke
686	'd' => chr(0xc4).chr(0x91),
687	'D' => chr(0xc4).chr(0x90),
688	'h' => chr(0xc4).chr(0xa7),
689	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
690	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
691	'l' => chr(0xc5).chr(0x82),
692	'L' => chr(0xc5).chr(0x81),
693	'o' => chr(0xc3).chr(0xb8),
694	'O' => chr(0xc3).chr(0x98),
695	't' => chr(0xc5).chr(0xa7),
696	'T' => chr(0xc5).chr(0xa6),
697	# german ss/szlig/sharp s
698	'ss' => chr(0xc3).chr(0x9f),
699	);
700
701	# convert latex-style accented characters.
702
703	# remove space (if any) between \ and letter to accent (eg {\' a})
704
705	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; #`
706
707	# remove {} around a single character (eg \'{e})
708	$text =~ s@(\\[`'="^~\.])\{(\w)}@{$1$2}@g; #`
709
710	# \, is another way of doing cedilla \c
711	$text =~ s@\\,(.)@\\c $1@g;
712
713	# remove {} around a single character for special 1 letter commands -
714	# need to insert a space. Eg \v{s} -> {\v s}
715	$text =~ s@(\\[uvcH])\{(\w)}@{$1 $2}@g;
716
717	# only do if the text contains a '\' character.
718	if ($text =~ m\|\\\|) {
719	# "normal" accents - ie non-alpha latex tag
720	while ($text =~ m@\\([`'="^~\.])([\w])@) { #`
721	my $tex="$1$2"; my $char="$2";
722	my $replacement=$utf8_chars{$tex};
723	if (!defined($replacement)) {
724	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
725	$replacement=$char;
726	}
727	$text =~ s/\\\Q$tex/$replacement/g;
728	}
729
730	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
731	# only do the change if immediately followed by a space, }, {, or \
732	# one letter accents ( + ss)
733	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
734	my $tex=$1;
735	my $replacement=$special_utf8_chars{$tex};
736	if (!defined($replacement)) {
737	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
738	$replacement=$tex;
739	}
740	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
741
742	}
743
744	# one letter latex accent commands that affect following letter
745	while ($text =~ m@\\([uvcH]) ([\w])@) {
746	my $tex="$1 $2"; my $char="$2";
747	my $replacement=$special_utf8_chars{$tex};
748	if (!defined($replacement)) {
749	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
750	$replacement=$char;
751	}
752	$text =~ s/\\$tex/$replacement/g;
753	}
754	}
755
756	# escape html-sensitive characters
757	$text =~ s@&@&@g;
758	$text =~ s@<@<@g;
759	$text =~ s@>@>@g;
760	$text =~ s/''/"/g; # Latex-specific
761	$text =~ s/``/"/g; # Latex-specific
762	# greenstone-specific
763	$text =~ s@\[@&\#91;@g;
764	$text =~ s@\]@&\#93;@g;
765
766	# remove latex commands
767
768	# explicitly recognised commands
769	$text =~ s@\\ldots@…@g;
770
771	# maths mode
772	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
773
774	# remove all other commands with optional arguments...
775	$text =~ s@\\\w+(\[.?\])?\s@@g;
776	# $text =~ s@\\noopsort{[^}]+\}@@g;
777	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
778
779	# remove latex groupings { } (but not \{ or \} )
780	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
781	$text =~ s/^\{//; # remove { if first char
782
783	# latex characters
784	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
785	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
786	# optional break "\-"
787	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
788	# the non-macro bits have quotes around them - we just remove quotes
789	$text =~ s/([^&])[\"\#]/$1/g;
790	}
791	# dashes. Convert (m\|n)-dash into single dash for html.
792	$text =~ s@\-\-+@\-@g;
793
794	# quoted { } chars
795	$text =~ s@\\\{@{@g;
796	$text =~ s@\\}@}@g;
797
798	# finally to protect against macro language...
799	$text =~ s@\\@\\\\@g;
800
801	return $text;
802	}
803
804
805	sub process_latex_math {
806	my $text = shift;
807
808	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
809	$text =~ s@\^\{(.*?)}@<sup>$1</sup>@g; # superscript
810	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
811	$text =~ s@\_\{(.*?)}@<sub>$1</sub>@g; # subscript
812	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
813
814	# put all other command names in italics
815	$text =~ s@\\([\w]+)@<i>$1</i>@g;
816
817	return $text;
818	}
819
820	sub add_segment_OID {
821	my $self = shift (@_);
822	my ($doc_obj, $id, $segment_number) = @_;
823	if ( $self->{'key'} eq "default") {
824	$self->SUPER::add_segment_OID(@_);
825	} else {
826	$doc_obj->set_OID($self->{'key'});
827	}
828	}
829
830	1;
831
832

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/plugins/BibTexPlugin.pm

Download in other formats: