Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 3112

Last change on this file since 3112 was 3112, checked in by jrm21, 22 years ago
minor changes to formatted values (eg if enclosed in { and } ) and added a few extra accented characters.
Property svn:keywords set to `Author Date Id Revision`
File size: 15.9 KB

Rev	Line
[1676]	1	###########################################################################
	2	#
	3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright 2000 Gordon W. Paynter
[2901]	10	# Copyright 1999-2001 New Zealand Digital Library Project
[1676]	11	#
	12	# This program is free software; you can redistribute it and/or modify
	13	# it under the terms of the GNU General Public License as published by
	14	# the Free Software Foundation; either version 2 of the License, or
	15	# (at your option) any later version.
	16	#
	17	# This program is distributed in the hope that it will be useful,
	18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	# GNU General Public License for more details.
	21	#
	22	# You should have received a copy of the GNU General Public License
	23	# along with this program; if not, write to the Free Software
	24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	25	#
	26	###########################################################################
	27
	28
	29	# BibTexPlug reads bibliography files in BibTex format.
	30	#
	31	# by Gordon W. Paynter ([email protected]), November 2000
	32	# Based on ReferPlug. See ReferPlug for geneology.
	33	#
	34	# BibTexPlug creates a document object for every reference a the file.
	35	# It is a subclass of SplitPlug, so if there are multiple records, all
	36	# are read.
[2901]	37	#
	38	# Modified Dec 2001 by John McPherson:
	39	# * some modifications submitted by Sergey Yevtushenko
	40	# <[email protected]>
	41	# * some non-ascii char support (ie mostly Latin)
	42	# * The raw ascii bibtex entry is stored as "BibTex" metadata.
[1676]	43
	44	package BibTexPlug;
	45
	46	use SplitPlug;
	47
	48	# BibTexPlug is a sub-class of BasPlug.
	49	sub BEGIN {
	50	@ISA = ('SplitPlug');
	51	}
	52
	53	# This plugin processes files with the suffix ".bib"
	54	sub get_default_process_exp {
	55	return q^(?i)\.bib$^;
	56	}
	57
	58	# This plugin splits the input text at blank lines
	59	sub get_default_split_exp {
	60	return q^\n+(?=@)^;
	61	}
	62
[2901]	63
	64
[1676]	65	# The process function reads a single bibliographic record and stores
	66	# it as a new document.
	67
	68	sub process {
	69	my $self = shift (@_);
	70	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
	71	my $outhandle = $self->{'outhandle'};
	72
[2484]	73	$self->{'key'} = "default";
	74
[1676]	75	# Check that we're dealing with a valid BibTex record
	76	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
	77
	78	# Ignore things we can't use
	79	return 0 if ($$textref =~ /^\@String/);
	80
	81	# Report that we're processing the file
	82	print $outhandle "BibTexPlug: processing $file\n"
	83	if ($self->{'verbosity'}) > 1;
	84
	85
[1711]	86	# This hash translates BibTex field names into metadata names. The
[1676]	87	# BibTex names are taken from the "Local Guide to Latex" Graeme
[2901]	88	# McKinstry. Metadata names are consistent with ReferPlug.
[1676]	89
	90	my %field = (
	91	'address', 'PublisherAddress',
	92	'author', 'Creator',
[2901]	93
[1676]	94	'booktitle', 'Booktitle',
	95	'chapter', 'Chapter',
	96	'edition', 'Edition',
[2901]	97	'editor', 'Editor',
[1676]	98	'institution', 'Publisher',
	99	'journal', 'Journal',
	100	'month', 'Month',
	101	'number', 'Number',
	102	'pages', 'Pages',
	103	'publisher', 'Publisher',
	104	'school', 'Publisher',
	105	'title', 'Title',
	106	'volume', 'Volume',
	107	'year', 'Date',
	108
	109	'keywords', 'Keywords',
	110	'abstract', 'Abstract',
[2901]	111	'copyright', 'Copyright'
	112	);
[1676]	113
	114	# Metadata fields
	115	my %metadata;
	116	my ($EntryType, $EntryID, $Creator, $Keywords, $text);
	117
[2901]	118	my $verbosity = $self->{'verbosity'};
	119	$verbosity = 0 unless $verbosity;
	120
	121	my $lines=$$textref;
	122
[1676]	123	# Make sure the text has exactly one entry per line
[2901]	124
	125	$lines =~ s/^\s(\@[^,]+,)\s\n/$1=====/; #splitting key in entry
	126	$lines =~ s/([\"\}]\s,)\s\n/$1=====/g; #splitting by comma, followed by \n (assuming end of lines are " or })
	127	$lines =~ s/(\d+\s\,)\s\n/$1=====/g; #for the case, when we have number entry without closing "
	128	$lines =~ s/\n\s*\n/%%%%%/g; #this was simply added in order to allow to process newline inside quoted strings,
	129	#that continues for several lines
[1676]	130	$lines =~ s/\s+/ /g;
	131	$lines =~ s/\s=====\s/\n/g;
[2901]	132
	133	my @all_lines = split(/\n+/, $lines);
	134
[1676]	135	# Read and process each line in the bib file.
[2901]	136	my ($entryname, $name, $value, $line);
	137	foreach $line (@all_lines) {
	138
[1676]	139	# Add each line. Most lines consist of a field identifer and
	140	# then data, and we simply store them, though we treat some
	141	# of the fields a bit differently.
	142
	143	$line =~ s/\s+/ /g;
	144	$text .= "$line\n";
	145
[2901]	146	print "Processing line = $line \n" if $verbosity>=4;
	147
[1676]	148	# The first line is special, it contains the reference type and OID
[2484]	149	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-]+)\W$/) {
[1676]	150	$EntryType = $1;
	151	$EntryID = $2;
[2484]	152	print "** $EntryType - \"$EntryID\" \n"
	153	if ($verbosity >= 4);
	154	$self->{'key'} = $EntryID;
[1676]	155	next;
	156	}
[2484]	157	if ($line =~ /\@/) {
[2901]	158	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
[2484]	159	if ($verbosity >= 2);
[2901]	160	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
[2484]	161	if ($verbosity >= 2);
	162	}
[1676]	163
	164	# otherwise, parse the metadata out of this line
	165	next unless ($line =~ /^\s(\w+)\s+=\s+(.)/);
[2901]	166	$entryname = lc($1);
[1676]	167	$value = $2;
[3112]	168	$value =~ s/,?\s*$//; # remove trailing comma and space
	169	if ($value =~ /^"/ && $value =~ /"$/) {
	170	# remove surrounding " marks
	171	$value =~ s/^"//; $value =~ s/"$//;
	172	}
[2901]	173	$value = &process_latex($value);
	174
[1676]	175	# Add this line of metadata
[2901]	176	$metadata{$entryname} .= "$value\n";
	177
[1676]	178	}
	179
[1677]	180	# Add the Entry type as metadata
[2901]	181	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
[1676]	182
	183	# Add the various field as metadata
[2901]	184	foreach my $entryname (keys %metadata) {
	185	next unless (defined $field{$entryname});
	186	next unless (defined $metadata{$entryname});
[1676]	187
[2901]	188	$name = $field{$entryname};
	189	$value = $metadata{$entryname};
[1676]	190
[2901]	191	# Add the various fields as metadata
	192	my $html_value = &text_into_html($value);
	193	$doc_obj->add_utf8_metadata ($cursection, $name, $html_value);
[1676]	194
	195	# Several special operatons on metadata follow
	196
	197	# Add individual keywords.
	198	# The full set of keywords will be added, in due course, as "Keywords".
	199	# However, we also want to add them as individual "Keyword" metadata elements.
[2901]	200	if ($entryname eq "keywords") {
[1676]	201	my @keywordlist = split(/,/, $value);
	202	foreach my $k (@keywordlist) {
[2901]	203	$k = lc($k);
[1676]	204	$k =~ s/\s*$//;
	205	$k =~ s/^\s*//;
	206	if ($k =~ /\w/) {
	207	$k = &text_into_html($k);
[2901]	208	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
[1676]	209	}
	210	}
	211	}
	212
	213	# Add individual authors
	214	# The author metadata will be stored as one "Creator" entry, but we
	215	# also want to split it into several individual "Author" fields in
[3112]	216	# "Lastname, Firstnames" format so we can browse it.
[2901]	217	if ($entryname eq "author") { #added also comparison with editor
	218
	219	# und here for german language...
	220	# don't use brackets in pattern, else the matched bit becomes
	221	# an element in the list!
	222	my @authorlist = split(/,\|\s+and\s+\|\s+und\s+/, $value);
[1676]	223	foreach $a (@authorlist) {
	224	$a =~ s/\s*$//;
	225	$a =~ s/^\s*//;
	226	# Reformat and add author name
[2901]	227	next if $a=~ /^\s*$/;
[1676]	228	my @words = split(/ /, $a);
	229	my $lastname = pop @words;
	230	my $firstname = join(" ", @words);
[2901]	231
[1676]	232	my $fullname = $lastname . ", " . $firstname;
[2901]	233
[1676]	234	# Add each name to set of Authors
[2901]	235	# force utf8 pragma so that \w matches in this scope
	236	use utf8;
[1676]	237	if ($fullname =~ /\w+, \w+/) {
[2901]	238	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
[1676]	239	}
	240	}
	241	}
	242
	243	# Books and Journals are additionally marked for display purposes
[2901]	244	if ($entryname eq "booktitle") {
	245	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
	246	} elsif ($entryname eq "journal") {
	247	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
[1676]	248	}
	249
	250	}
	251
	252	# Add the text in BibTex format (all fields)
	253	if ($text =~ /\w/) {
	254	$text = &text_into_html($text);
[2901]	255	$doc_obj->add_utf8_text ($cursection, $text);
	256	$doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
[1676]	257	}
	258
[1677]	259	return 1;
[1676]	260	}
	261
[1677]	262
[2901]	263
	264
	265	# convert email addresses and URLs into links
	266	sub convert_urls_into_links{
	267	my ($text) = @_;
	268
	269	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
	270	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
	271
	272	return $text;
	273	}
	274
	275	# Clean up whitespace and convert \n charaters to <BR> or <P>
	276	sub clean_up_whitespaces{
	277	my ($text) = @_;
	278
	279	$text =~ s/%%%%%/<BR> <BR>/g;
	280	$text =~ s/ +/ /g;
	281	$text =~ s/\s*$//;
	282	$text =~ s/^\s*//;
	283	$text =~ s/\n/\n<BR>/g;
	284	$text =~ s/<BR>\s*<BR>/<P>/g;
	285
	286	return $text;
	287	}
	288
	289
	290	sub convert_problem_characters_without_ampersand{
	291	my ($text) = @_;
	292	$text =~ s/</</g;
	293	$text =~ s/>/>/g;
	294
	295	$text =~ s/\'\'/\"/g; #Latex -specific conversion
	296	$text =~ s/\`\`/\"/g; #Latex -specific conversion
	297
	298
	299	$text =~ s/\"/"/g;
	300	$text =~ s/\'/’/g;
	301	$text =~ s/\`/‘/g;
	302	$text =~ s/\+/ /g;
	303	$text =~ s/\(/ /g;
	304	$text =~ s/\)/ /g;
	305
	306	$text =~ s/\\/\\\\/g;
	307
	308	$text =~ s/\./\\\./g;
	309
	310	return $text;
	311	}
	312
[1676]	313	# Convert a text string into HTML.
	314
[1677]	315	# The HTML is going to be inserted into a GML file, so we have to be
[2901]	316	# careful not to use symbols like ">", which occurs frequently in email
[1677]	317	# messages (and use &gt instead.
	318
	319	# This function also turns URLs and email addresses into links, and
	320	# replaces carriage returns with <BR> tags (and multiple carriage returns
	321	# with <P> tags).
	322
[1676]	323	sub text_into_html {
	324	my ($text) = @_;
	325
[2901]	326	# Convert problem characters into HTML symbols
[1676]	327	$text =~ s/&/&/g;
	328
[2901]	329	$text = &convert_problem_characters_without_ampersand( $text );
	330
[1676]	331	# convert email addresses and URLs into links
[2901]	332	$text = &convert_urls_into_links( $text );
[1676]	333
[2901]	334	$text = &clean_up_whitespaces( $text );
[1676]	335
	336	return $text;
	337	}
	338
[2901]	339
	340
	341
	342	# Convert accented characters, remove { }, interprete some commands....
	343	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
	344	sub process_latex {
	345	my ($text) = @_;
	346
	347	# note - this is really ugly, but it works. There may be a prettier way
	348	# of mapping latex accented chars to utf8, but we just brute force it here.
	349	# Also, this isn't complete - not every single possible accented letter
	350	# is in here yet, but most of the common ones are.
	351
	352	my %utf8_chars =
	353	(
	354	# acutes
	355	'\'a' => chr(0xc3).chr(0xa1),
	356	'\'c' => chr(0xc4).chr(0x87),
	357	'\'e' => chr(0xc3).chr(0xa9),
	358	'\'i' => chr(0xc3).chr(0xad),
	359	'\'l' => chr(0xc3).chr(0xba),
	360	'\'n' => chr(0xc3).chr(0x84),
	361	'\'o' => chr(0xc3).chr(0xb3),
	362	'\'r' => chr(0xc5).chr(0x95),
	363	'\'s' => chr(0xc5).chr(0x9b),
	364	'\'u' => chr(0xc3).chr(0xba),
	365	'\'y' => chr(0xc3).chr(0xbd),
	366	'\'z' => chr(0xc5).chr(0xba),
	367	# graves
	368	'`a' => chr(0xc3).chr(0xa0),
	369	'`A' => chr(0xc3).chr(0x80),
	370	'`e' => chr(0xc3).chr(0xa8),
	371	'`E' => chr(0xc3).chr(0x88),
	372	'`i' => chr(0xc3).chr(0xac),
	373	'`I' => chr(0xc3).chr(0x8c),
	374	'`o' => chr(0xc3).chr(0xb2),
	375	'`O' => chr(0xc3).chr(0x92),
	376	'`u' => chr(0xc3).chr(0xb9),
	377	'`U' => chr(0xc3).chr(0x99),
	378	# circumflex
	379	'^a' => chr(0xc3).chr(0xa2),
	380	'^A' => chr(0xc3).chr(0x82),
	381	'^c' => chr(0xc4).chr(0x89),
	382	'^C' => chr(0xc4).chr(0x88),
	383	'^e' => chr(0xc3).chr(0xaa),
	384	'^E' => chr(0xc3).chr(0x8a),
	385	'^g' => chr(0xc4).chr(0x9d),
	386	'^G' => chr(0xc4).chr(0x9c),
	387	'^h' => chr(0xc4).chr(0xa5),
	388	'^H' => chr(0xc4).chr(0xa4),
	389	'^i' => chr(0xc3).chr(0xae),
	390	'^I' => chr(0xc3).chr(0x8e),
	391	'^j' => chr(0xc4).chr(0xb5),
	392	'^J' => chr(0xc4).chr(0xb4),
	393	'^o' => chr(0xc3).chr(0xb4),
	394	'^O' => chr(0xc3).chr(0x94),
	395	'^s' => chr(0xc5).chr(0x9d),
	396	'^S' => chr(0xc5).chr(0x9c),
	397	'^u' => chr(0xc3).chr(0xa2),
	398	'^U' => chr(0xc3).chr(0xbb),
	399	'^w' => chr(0xc5).chr(0xb5),
	400	'^W' => chr(0xc5).chr(0xb4),
	401	'^y' => chr(0xc5).chr(0xb7),
	402	'^Y' => chr(0xc5).chr(0xb6),
	403
	404	# diaeresis
	405	'"a' => chr(0xc3).chr(0xa4),
	406	'"A' => chr(0xc3).chr(0x84),
	407	'"e' => chr(0xc3).chr(0xab),
	408	'"E' => chr(0xc3).chr(0x8b),
	409	'"\\\\i' => chr(0xc3).chr(0xaf),
	410	'"\\\\I' => chr(0xc3).chr(0x8f),
	411	'"o' => chr(0xc3).chr(0xb6),
	412	'"O' => chr(0xc3).chr(0x96),
	413	'"u' => chr(0xc3).chr(0xbc),
	414	'"U' => chr(0xc3).chr(0x9c),
	415	'"y' => chr(0xc3).chr(0xbf),
	416	'"Y' => chr(0xc3).chr(0xb8),
	417	# tilde
	418	# caron - handled specially
	419	# ',s' => chr(0xc5).chr(0xa1),
	420	# ',S' => chr(0xc5).chr(0xa5),
	421	# breve
	422	# double acute
	423	# ring
	424	# dot
[3112]	425	'\.c' => chr(0xc4).chr(0x8b),
	426	'\.C' => chr(0xc4).chr(0x8a),
	427	'\.e' => chr(0xc4).chr(0x97),
	428	'\.E' => chr(0xc4).chr(0x96),
	429	'\.g' => chr(0xc4).chr(0xa1),
	430	'\.G' => chr(0xc4).chr(0xa0),
	431	'\.I' => chr(0xc4).chr(0xb0),
	432	'\.z' => chr(0xc5).chr(0xbc),
	433	'\.Z' => chr(0xc5).chr(0xbb),
[2901]	434	# macron
	435	'=a' => chr(0xc4).chr(0x81),
	436	'=A' => chr(0xc4).chr(0x80),
	437	'=e' => chr(0xc4).chr(0x93),
	438	'=E' => chr(0xc4).chr(0x92),
	439	'=i' => chr(0xc4).chr(0xab),
	440	'=I' => chr(0xc4).chr(0xaa),
	441	'=o' => chr(0xc4).chr(0x8d),
	442	'=O' => chr(0xc4).chr(0x8c),
	443	'=u' => chr(0xc4).chr(0xab),
	444	'=U' => chr(0xc4).chr(0xaa),
	445
	446	# stroke - handled specially - see below
	447
	448	# cedilla - handled specially
	449
	450	);
	451
	452	# these are one letter latex commands - we make sure they're not a longer
	453	# command name. eg {\d} is d+stroke, so careful of \d
	454	my %special_utf8_chars =
	455	(
	456	# caron
	457	'v n' => chr(0xc5).chr(0x88),
	458	'v N' => chr(0xc5).chr(0x87),
	459	'v s' => chr(0xc5).chr(0xa1),
	460	'v S' => chr(0xc5).chr(0xa5),
	461	# cedilla
	462	'c c' => chr(0xc3).chr(0xa7),
	463	'c C' => chr(0xc3).chr(0x87),
	464	'c g' => chr(0xc4).chr(0xa3),
	465	'c G' => chr(0xc4).chr(0xa2),
	466	'c k' => chr(0xc4).chr(0xb7),
	467	'c K' => chr(0xc4).chr(0xb6),
	468	'c l' => chr(0xc4).chr(0xbc),
	469	'c L' => chr(0xc4).chr(0xbb),
	470	'c n' => chr(0xc5).chr(0x86),
	471	'c N' => chr(0xc5).chr(0x85),
	472	'c r' => chr(0xc5).chr(0x97),
	473	'c R' => chr(0xc5).chr(0x96),
	474	'c s' => chr(0xc5).chr(0x9f),
	475	'c S' => chr(0xc5).chr(0x9e),
	476	'c t' => chr(0xc5).chr(0xa3),
	477	'c T' => chr(0xc5).chr(0xa2),
	478	# double acute / Hungarian accent
	479	'H O' => chr(0xc5).chr(0x90),
	480	'H o' => chr(0xc5).chr(0x91),
	481	'H U' => chr(0xc5).chr(0xb0),
	482	'H u' => chr(0xc5).chr(0xb1),
	483
	484	# stroke
	485	'd' => chr(0xc4).chr(0x91),
	486	'D' => chr(0xc4).chr(0x90),
	487	'h' => chr(0xc4).chr(0xa7),
	488	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
[3112]	489	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
[2901]	490	'l' => chr(0xc5).chr(0x82),
	491	'L' => chr(0xc5).chr(0x81),
	492	'o' => chr(0xc3).chr(0xb8),
	493	'O' => chr(0xc3).chr(0x98),
	494	't' => chr(0xc5).chr(0xa7),
	495	'T' => chr(0xc5).chr(0xa6),
	496	# german ss/szlig/sharp s
	497	'ss' => chr(0xc3).chr(0x9f),
	498	);
	499
	500	# convert latex-style accented characters.
	501	# remove space (if any) between \ and letter to accent (eg {\' a})
	502
	503	$text =~ s@(\\[`'="])\s(\w)@$1$2@g;
	504
	505	# remove {} around a single character (eg \'{e})
[3112]	506	$text =~ s@(\\[`'="\.]){(\w)}@$1$2@g;
[2901]	507
	508	# remove {} around a single character for special 1 letter commands -
	509	# need to insert a space. Eg \v{s} -> {\v s}
	510	$text =~ s@(\\[vcH]){(\w)}@{$1 $2}@;
	511
	512	# this is slow (go through whole hash for each substitution!) so
	513	# only do if the text contains a '\' character.
	514	if ($text =~ m\|\\\|) {
	515	for $latex_code (keys %utf8_chars) {
	516	$text =~ s/\\$latex_code/$utf8_chars{$latex_code}/g;
	517	}
	518
	519	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
	520	# only do the change if immediately followed by a space, }, {, or \
	521	for $latex_code (keys %special_utf8_chars) {
	522	$text =~ s/\\${latex_code}([\\\s\{\}])/$special_utf8_chars{$latex_code}$1/g;
	523	}
	524	}
	525
[3112]	526	# remove latex commands
	527	$text =~ s@\\\w+{([^}]*)}@$1@g;
	528
[2901]	529	# remove latex groupings { } (but not \{ or \} )
[3112]	530	$text =~ s/([^\\])\{/$1/g;
	531	$text =~ s/([^\\])\}/$1/g;
	532	$text =~ s/^{//; # remove { if first char
[2901]	533
	534	# maths mode $...$ - this is not interpreted in any way at the moment...
	535	$text =~ s@\$(.*)\$@$1@g;
	536
	537	# quoted { } chars
	538	$text =~ s@\\{@{@g;
	539	$text =~ s@\\}@}@g;
	540
	541	return $text;
	542	}
	543
	544
[2484]	545	sub set_OID {
	546	my $self = shift (@_);
	547	my ($doc_obj, $id, $segment_number) = @_;
	548
	549	if ( $self->{'key'} eq "default") {
	550	$doc_obj->set_OID();
	551	} else {
	552	$doc_obj->set_OID($self->{'key'});
	553	}
	554	}
[1676]	555
[2484]	556	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: