source: trunk/gsdl/perllib/acronym.pm@ 1393

Last change on this file since 1393 was 1393, checked in by say1, 24 years ago

acronym markup functionality

  • Property svn:keywords set to Author Date Id Revision
File size: 16.8 KB
Line 
1##########################################################################
2#
3# acronym.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25# class to handle acronyms
26###########################################################################
27
28use strict;
29use diagnostics;
30
31package acronym;
32
33###########################################################################
34# global variables
35###########################################################################
36# valiables to control the recall/precision tradeoff
37
38#the maximum range to look for acronyms
39my $max_offset = 30;
40#acronyms must be upper case
41my $upper_case = 1;
42#acronym case must match
43my $case_match = 1;
44#minimum acronym length
45my $min_def_length = 3;
46#minimum acronym length
47my $min_acro_length = 3;
48#minimum acronym length saving
49my $min_length_saving = 4;
50#allow recusive acronyms
51my $allow_recursive = "";
52
53my @stop_words = split / /, "OF AT THE IN TO AND";
54
55#the text split into an array, one word per element
56my @split_text = ();
57my @acronym_list = ();
58
59#the file to collate acronyms into
60my $acronym_accumulate_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_definitions.pm";
61my $acronym_options_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_options.pm";
62
63my %acronyms_found_in_collection = ();
64my %acronyms_banned_from_collection = ();
65
66my $writing_acronyms = 1;
67my $accumulate_acronyms = 1;
68my $markup_accumulate_acronyms = 1;
69my $markup_local_acronyms = 1;
70
71
72
73###########################################################################
74# file saving / loading stuff
75###########################################################################
76
77sub init_acronyms {
78
79 my $file_text = "";
80 if (open ACRONYM_HANDLE, "<$acronym_options_file")
81 {
82 $file_text = do { local $/; <ACRONYM_HANDLE> };
83 }
84 if ($file_text eq "")
85 {
86 print STDERR "failed to open $acronym_options_file\n";
87 open ACRONYM_HANDLE, ">$acronym_options_file\n";
88 print ACRONYM_HANDLE "#Config file for acronym extraction. EDIT THIS FILE, it should\n";
89 print ACRONYM_HANDLE "#not be overridden by the software. It's read by GSDL using perl's\n";
90 print ACRONYM_HANDLE "#'eval' function, so pretty much anything that's valid in perl is \n";
91 print ACRONYM_HANDLE "#valid here.\n\n";
92 print ACRONYM_HANDLE "#Quite a few things here are defined in terms of recall and precision\n";
93 print ACRONYM_HANDLE "#which are the key measures from Information Retreval (IR). If you\n";
94 print ACRONYM_HANDLE "#don't understand recall and precision, any good IR textbook should\n";
95 print ACRONYM_HANDLE "#explain them fully \n\n";
96 print ACRONYM_HANDLE "#the maximum range to look for acronyms (raise to raise precision)\n";
97 print ACRONYM_HANDLE "\$max_offset = 30;\n\n";
98 print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true)\n";
99 print ACRONYM_HANDLE "\$upper_case = 1;\n\n";
100 print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true)\n";
101 print ACRONYM_HANDLE "\$case_match = 1;\n\n";
102 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
103 print ACRONYM_HANDLE "\$min_def_length = 3;\n\n";
104 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
105 print ACRONYM_HANDLE "\$min_acro_length = 3;\n\n";
106 print ACRONYM_HANDLE "#minimum acronym length saving (raise to raise precision)\n";
107 print ACRONYM_HANDLE "\$min_length_saving = 4;\n\n";
108 print ACRONYM_HANDLE "#allow recusive acronyms (0 = false, 1 = true)\n";
109 print ACRONYM_HANDLE "\$allow_recursive = 0;\n\n";
110 print ACRONYM_HANDLE "#stop words-words allowed in acronyms (the multi-lingual version\n";
111 print ACRONYM_HANDLE "#slows down acronym extraction slightly so is not the default)\n";
112 print ACRONYM_HANDLE "#\@stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n";
113 print ACRONYM_HANDLE "\@stop_words = split / /, \"OF AT THE IN TO AND\";\n";
114 print ACRONYM_HANDLE "\n";
115 print ACRONYM_HANDLE "#the file to collate acronyms into\n";
116 print ACRONYM_HANDLE "\$acronym_accumulate_file = \"$ENV{'GSDLCOLLECTDIR'}\" . \"/etc/acronym_definitions.pm\";\n";
117 print ACRONYM_HANDLE "\n";
118 print ACRONYM_HANDLE "\$accumulate_acronyms = 1;\n\n";
119 print ACRONYM_HANDLE "# any acronym definitions which should always be marked up can be copied here\n";
120 print ACRONYM_HANDLE "# from the acronym_accumulate_file file ...\n";
121 print ACRONYM_HANDLE "# \n";
122 print ACRONYM_HANDLE "# \n";
123 print ACRONYM_HANDLE "# \n";
124 }
125 eval $file_text;
126 #promotes warnings/errors from evaluated file to current context
127 warn $@ if $@;
128
129
130
131 &read_all_acronyms_from_file();
132# rename $acronym_file, $acronym_file . "." . int(rand (2<<7)).
133# int(rand (2<<7)). int(rand (2<<7)). int(rand (2<<7));
134 if ($writing_acronyms && open ACRONYM_HANDLE, ">$acronym_accumulate_file")
135 {
136 print ACRONYM_HANDLE "#This is an automatically generated file.\n";
137 print ACRONYM_HANDLE "#\n";
138 print ACRONYM_HANDLE "#If you edit this file and it will be overwritten the next\n";
139 print ACRONYM_HANDLE "#time the acronym code runs unless you set the file to \n";
140 print ACRONYM_HANDLE "#read-only. \n";
141 print ACRONYM_HANDLE "#\n";
142 print ACRONYM_HANDLE "#start of acronyms...\n";
143 $writing_acronyms = 1;
144 } else {
145 warn "failed to open $acronym_accumulate_file for writing\n";
146 $writing_acronyms = 0;
147 }
148}
149
150#close the list of accumulated acronyms
151END {
152 if ($writing_acronyms)
153 {
154 print ACRONYM_HANDLE "#end of acronyms.\n";
155 close ACRONYM_HANDLE;
156 }
157}
158
159#eval a file of accumulated acronyms
160sub read_all_acronyms_from_file {
161
162 my $file_text = "";
163 if (open ACRONYM_HANDLE, "<$acronym_accumulate_file")
164 {
165 $file_text = do { local $/; <ACRONYM_HANDLE> };
166 } else {
167 print STDERR "failed to open $acronym_accumulate_file for reading (this is the first pass?).\n";
168 }
169 eval $file_text;
170 #promotes warnings/errors from evaluated file to current context
171 warn $@ if $@;
172}
173
174#called from within the file of accumulated acronyms to indicate a good acronym
175sub add {
176 my $self = shift (@_);
177 if (defined ($acronyms_found_in_collection{$self->[0]}))
178 {
179 my $def = $self->to_def_string();
180 if ($acronyms_found_in_collection{$self->[0]} =~ m/(^|\|)$def(\||$)/)
181 {
182 return;
183 }
184 $acronyms_found_in_collection{$self->[0]} =
185 $acronyms_found_in_collection{$self->[0]} . "|" . $self->to_def_string();
186 } else {
187 $acronyms_found_in_collection{$self->[0]} = $self->to_def_string();
188 }
189}
190
191#called from within the file of accumulated acronyms to indicate a bad acronym
192sub ban {
193 my $self = shift (@_);
194
195 if (!defined $acronyms_banned_from_collection{$self->[0]})
196 {
197 $acronyms_banned_from_collection{$self->[0]} = $self->to_def_string();
198 } else {
199 $acronyms_banned_from_collection{$self->[0]} = $acronyms_banned_from_collection{$self->[0]} . "|" . $self->to_def_string();
200 }
201}
202
203
204#write a good acronym to the accumulated acronyms file
205sub write_to_file {
206 my $self = shift (@_);
207 if ($writing_acronyms)
208 {
209 print ACRONYM_HANDLE "new acronym(\"$self->[0]\",\"" .
210 $self->to_def_string() .
211 "\")->add();\n";
212 }
213}
214
215
216###########################################################################
217# mark functionality
218###########################################################################
219
220#small routine to sort by length
221sub sort_by_length {
222 length($b) <=> length($a) or $a cmp $b
223}
224
225sub markup_acronyms {
226 my $text = shift (@_);
227 my $verbosity_obj = shift (@_);
228 if (defined $text)
229 {
230 for my $acro (sort sort_by_length keys %acronyms_found_in_collection)
231 {
232 $text =~ s/^((?:[^\<\n]|(?:\<[^\>\n]*\>))*)$acro([^\<A-Z])/$1$acro\<img src=\"\" width=8 height=8 alt=\"$acronyms_found_in_collection{$acro}\"\>$2/gm;
233 printf STDERR " " . $acro . ","
234 if ($verbosity_obj->{'verbosity'} >= 2);
235 }
236 }
237 return $text;
238}
239
240
241
242###########################################################################
243# member functions
244###########################################################################
245
246
247sub new {
248 my $trash = shift (@_);
249 my $acro = shift (@_);
250 my $def = shift (@_);
251
252 my $self = [
253 "", # 0 acronym
254 [], # 1 definition
255 ];
256
257 $self->[0] = $acro if defined $acro;
258 push @{$self->[1]}, split / /, $def if defined $def;
259
260 bless $self;
261}
262
263sub clone {
264 my $self = shift (@_);
265
266 my $copy = new acronym;
267
268 $copy->[0] = $self->[0];
269 push @{$copy->[1]}, @{$self->[1]};
270 bless $copy;
271
272 return $copy;
273}
274
275#return the acronym
276sub to_acronym {
277 my $self = shift (@_);
278 my @array = @{$self->[1]};
279
280 return $self->[0];
281}
282
283#return the number of words in the acronym definition
284sub words_in_acronym_definition {
285 my $self = shift (@_);
286 my @array = @{$self->[1]};
287
288 return $#array + 1;
289}
290
291#return the number of letters in the acronym definition
292sub letters_in_acronym_definition {
293 my $self = shift (@_);
294
295 return length($self->to_def_string());
296}
297
298#return the number of letters in the acronym definition
299sub letters_in_acronym {
300 my $self = shift (@_);
301
302 return length($self->to_acronym());
303}
304
305#return the acronym definition
306sub to_def_string {
307 my $self = shift (@_);
308
309 my $result = "";
310
311 # do the definition
312 my @array = @{$self->[1]};
313 my $i = 0;
314 while ($i <= $#array)
315 {
316 $result = $result . $array[$i];
317
318 if ($i+1 <= $#array)
319 {
320 $result = $result . " ";
321 }
322 $i++;
323 }
324 return $result;
325}
326
327
328#print out the kwic for the acronym
329sub to_string_kwic {
330 my $self = shift (@_);
331
332 # the list of all possible combinations
333 my @list = ();
334
335 my $result = "";
336
337 my $j = 0;
338 my @array = @{$self->[1]};
339 while ($j <= $#array)
340 {
341
342 # do the definition
343 my $i = 0;
344
345 #add the key word
346 $result = "<td halign=left>" . $array[$j] . "</td><td halign=right>";
347
348 #add the proceeding words
349 while ($i < $j)
350 {
351 $result = $result . $array[$i] . " ";
352 $i++;
353 }
354 #add the key word
355 $result = $result . "</td><td halign=left>" . $array[$j] .
356 "</td><td halign=left>";
357
358 #add the trailing words
359 $i++;
360 while ($i <= $#array )
361 {
362 $result = $result . $array[$i] . " ";
363 $i++;
364 }
365
366 #add the actual acronym
367
368 $result = $result . "</td><td halign=left>" . $self->[0] . "</td>";
369
370 push @list, $result;
371 $j++;
372 }
373 return @list;
374}
375
376#this is the one used when building the collection ...
377sub to_string {
378 my $self = shift (@_);
379
380 my $result = $self->[0] . " ";
381
382 # do the definition
383 my @array = @{$self->[1]};
384 my $i = 0;
385 while ($i <= $#array)
386 {
387 $result = $result . $array[$i];
388 if ($i+1 <= $#array)
389 {
390 $result = $result . " ";
391 }
392 $i++;
393 }
394 return $result;
395}
396
397sub check {
398 my $self = shift (@_);
399
400 if (length($self->to_acronym()) < $min_acro_length)
401 {
402# print "acronym " . $self->to_string() . " rejected (too short I)\n";
403 return 0;
404 }
405 if ($self->words_in_acronym_definition() < $min_def_length)
406 {
407# print "acronym " . $self->to_string() . " rejected (too short II)\n";
408 return 0;
409 }
410 if ($min_length_saving * $self->letters_in_acronym() >
411 $self->letters_in_acronym_definition())
412 {
413# print "acronym " . $self->to_string() . " rejected (too short III)\n";
414# print "" . $min_length_saving .
415# "|" . $self->letters_in_acronym() .
416# "|" . $self->letters_in_acronym_definition() . "\n";
417 return 0;
418 }
419# print "acronym " . $self->to_string() . " not rejected\n";
420 return 1;
421}
422
423###########################################################################
424# static functions
425###########################################################################
426
427sub recurse {
428 my ($acro_offset, #offset of word we're finding acronyms for
429 $text_offset,
430 $letter_offset,
431 @def_so_far) = @_;
432
433 my $word = $split_text[$text_offset];
434 my $acro = $split_text[$acro_offset];
435 $word = "" if !defined $word;
436 $acro = "" if !defined $acro;
437
438# print "recurse(" . $acro_offset . ", " . $text_offset . ", " .
439# $letter_offset . ", " . @def_so_far . ")\n";
440
441 #check for termination ...
442 if ($letter_offset >= length($acro))
443 {
444 my $acronym = new acronym();
445 $acronym->[0] = $acro;
446 push @{$acronym->[1]}, @def_so_far;
447 if ($acronym->check())
448 {
449 push @acronym_list, ( $acronym );
450 }
451# print "acronym created\n";
452 return;
453 }
454 #check for recursion
455 if (!$allow_recursive)
456 {
457 if ($word eq $acro)
458 {
459# print "recursion detected\n";
460 return;
461 }
462 }
463
464 #skip a stop-word
465 my $i = 0;
466 if ($letter_offset != 0)
467 {
468 while ($i <= $#stop_words)
469 {
470 if ($stop_words[$i] eq uc($word))
471 {
472# print "::found stop word::" . $stop_words[$i] . "\n";
473 &recurse($acro_offset,
474 $text_offset+1,
475 $letter_offset,
476 @def_so_far, $word);
477 }
478 $i++;
479 }
480 }
481 $i = 1;
482 #using the first $i letters ...
483 while ($letter_offset+$i <= length($acro) )
484 {
485# print "". ((substr $word, 0, $i) . " " .
486# (substr $acro, $letter_offset, $i) . "\n");
487 if (((!$case_match) &&
488 (uc(substr $word, 0, $i) eq
489 uc(substr $acro, $letter_offset, $i)))
490 ||
491 (($case_match) &&
492 ((substr $word, 0, $i) eq
493 (substr $acro, $letter_offset, $i))))
494 {
495# print "::match::\n";
496# print "" . ((substr $word, 0, $i) . " " .
497# (substr $acro, $letter_offset, $i) . "\n");
498 &recurse($acro_offset,
499 $text_offset+1,
500 $letter_offset+$i,
501 @def_so_far, $word);
502 } else {
503 return;
504 }
505 $i++;
506 }
507 return;
508}
509
510
511sub acronyms {
512 #clean up the text
513 my $processed_text = shift @_;
514 $$processed_text =~ s/[^A-Za-z]/ /g;
515 $$processed_text =~ s/\s+/ /g;
516
517 #clear some global variables
518 @split_text = ();
519 @acronym_list = ();
520
521 return &acronyms_from_clean_text($processed_text);
522}
523
524sub acronyms_from_clean_text {
525 my ($processed_text) = @_;
526
527 @split_text = split / /, $$processed_text;
528
529# my $i = 0;
530# while ($i <= $#split_text)
531# {
532# print $split_text[$i] . "\n";
533# $i++;
534# }
535
536 my $first = 0;
537 my $last = $#split_text +1;
538 my $acro_counter = $first;
539
540 while ($acro_counter < $last)
541 {
542 my $word = $split_text[$acro_counter];
543
544 if ((!$upper_case) ||
545 (uc($word) eq $word))
546 {
547
548 if (length $word >= $min_acro_length)
549 {
550 my $def_counter = 0;
551 if ($acro_counter - $max_offset > 0)
552 {
553 $def_counter = $acro_counter - $max_offset;
554 }
555 my $local_last = $acro_counter + $max_offset;
556 if ($local_last > $last)
557 {
558 $local_last = $last;
559 }
560 while ($def_counter <= $local_last)
561 {
562 &recurse($acro_counter,$def_counter,0,());
563 $def_counter++;
564 }
565 }
566 }
567 $acro_counter++;
568 }
569
570 return \@acronym_list;
571}
572
573
574
575sub test {
576
577# my $blarg = new acronym();
578# my $simple;
579# $simple = 10;
580# $blarg->initialise($simple, $simple, $simple);
581# my $blarg2 = $blarg->clone();
582# print $blarg->to_string();
583# print $blarg2;
584# print "\n";
585#
586 my $tla = new acronym();
587 $tla->[0] = "TLA";
588
589 my @array = ("Three", "Letter", "Acronym");
590# my $i = 0;
591# while ($i <= $#array)
592# {
593# print @array[$i] . "\n";
594# $i++;
595# }
596
597 print "\n";
598 push @{$tla->[1]}, ("Three" );
599 push @{$tla->[1]}, ("Letter" );
600 push @{$tla->[1]}, ("Acronym" );
601 print $tla->to_string(). "\n";
602 print "\n";
603 print "\n";
604 my $tla2 = $tla->clone();
605 push @{$tla2->[1]}, ("One");
606 push @{$tla2->[1]}, ("Two");
607 $tla2->[0] = "ALT";
608 print $tla->to_string(). "\n";
609 print $tla2->to_string(). "\n";
610
611 print "\n";
612 print "\n";
613
614 print "Testing recursion ...\n";
615 my $acros = &acronyms("TLA Three Letter Acronym in tla TlA");
616
617 foreach my $acro (@$acros)
618 {
619 if ($acro->check)
620 {
621 print "accepted: " .$acro->to_string() . "\n";
622# print "|" . $acro->to_acronym() . "|" . $acro->to_def_string() .
623# "|" . $acro->words_in_acronym_definition() .
624# "|" . $acro->letters_in_acronym_definition() .
625# "|" . $acro->letters_in_acronym() . "|\n";
626 } else {
627# print "found but rejected: " .$acro->to_string() . "\n";
628 }
629 }
630}
631
632#uncomment this line to test this package
633#&test();
634
635&init_acronyms();
636
6371;
638
639
Note: See TracBrowser for help on using the repository browser.