source: trunk/gsdl/perllib/acronym.pm@ 2785

Last change on this file since 2785 was 2785, checked in by sjboddie, 23 years ago

The build process now creates a summary of how many files were included,
which were rejected, etc. A link to a page containing this summary is
provided from the final page of the collector (once the collection is built
successfully) and from the default "about this collection" text for
collections built by the collector.

Also did a little bit of tidying in a couple of places

  • Property svn:keywords set to Author Date Id Revision
File size: 19.0 KB
Line 
1##########################################################################
2#
3# acronym.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25# class to handle acronyms
26###########################################################################
27
28use diagnostics;
29
30package acronym;
31
32###########################################################################
33# global variables
34###########################################################################
35# valiables to control the recall/precision tradeoff
36
37#the maximum range to look for acronyms
38my $max_offset = 30;
39#acronyms must be upper case
40my $upper_case = 1;
41#acronym case must match
42my $case_match = 1;
43#minimum acronym length
44my $min_def_length = 3;
45#minimum acronym length
46my $min_acro_length = 3;
47#minimum acronym length saving
48my $min_length_saving = 4;
49#allow recusive acronyms
50my $allow_recursive = 0;
51#let definitions be all capitals
52my $allow_all_caps = 0;
53
54my @stop_words = split / /, "OF AT THE IN TO AND";
55
56#the text split into an array, one word per element
57my @split_text = ();
58my @acronym_list = ();
59
60#the file to collate acronyms into
61my $acronym_accumulate_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_definitions.pm";
62my $acronym_options_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_options.pm";
63
64my %acronyms_found_in_collection = ();
65my %acronyms_banned_from_collection = ();
66
67my $writing_acronyms = 1;
68my $accumulate_acronyms = 1;
69my $markup_accumulate_acronyms = 1;
70my $markup_local_acronyms = 1;
71
72
73
74###########################################################################
75# file saving / loading stuff
76###########################################################################
77
78sub initialise_acronyms {
79
80 my $local_max_offset = $max_offset;
81 my $local_upper_case = $upper_case;
82 my $local_case_match = $case_match ;
83 my $local_min_def_length = $min_def_length;
84 my $local_min_acro_length = $min_acro_length;
85 my $local_min_length_saving = $min_length_saving;
86 my $local_allow_recursive = $allow_recursive;
87 my $local_allow_all_caps = $allow_all_caps;
88 my @local_stop_words = @stop_words;
89 my $local_acronym_accumulate_file = $acronym_accumulate_file;
90
91
92 my $file_text = "";
93 if (open ACRONYM_HANDLE, "<$acronym_options_file")
94 {
95 $file_text = do { local $/; <ACRONYM_HANDLE> };
96 }
97 if ($file_text eq "")
98 {
99 print STDERR "failed to open $acronym_options_file\n";
100 open ACRONYM_HANDLE, ">$acronym_options_file\n";
101 print ACRONYM_HANDLE "#Config file for acronym extraction. EDIT THIS FILE, it should\n";
102 print ACRONYM_HANDLE "#not be overridden by the software. It's read by GSDL using perl's\n";
103 print ACRONYM_HANDLE "#'eval' function, so pretty much anything that's valid in perl is \n";
104 print ACRONYM_HANDLE "#valid here.\n\n";
105 print ACRONYM_HANDLE "#Quite a few things here are defined in terms of recall and precision\n";
106 print ACRONYM_HANDLE "#which are the key measures from Information Retreval (IR). If you\n";
107 print ACRONYM_HANDLE "#don't understand recall and precision, any good IR textbook should\n";
108 print ACRONYM_HANDLE "#explain them fully \n\n";
109 print ACRONYM_HANDLE "#the maximum range to look for acronyms (raise to raise precision)\n";
110 print ACRONYM_HANDLE "\$local_max_offset = $max_offset;\n\n";
111 print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true (high precision))\n";
112 print ACRONYM_HANDLE "\$local_upper_case = $upper_case;\n\n";
113 print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true (high precision))\n";
114 print ACRONYM_HANDLE "\$local_case_match = $case_match;\n\n";
115 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
116 print ACRONYM_HANDLE "\$local_min_def_length = $min_def_length;\n\n";
117 print ACRONYM_HANDLE "#let definitions be all capitals\n";
118 print ACRONYM_HANDLE "\$local_allow_all_caps = $allow_all_caps;\n\n";
119 print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
120 print ACRONYM_HANDLE "\$local_min_acro_length = 3;\n\n";
121 print ACRONYM_HANDLE "#minimum acronym length saving (raise to raise precision)\n";
122 print ACRONYM_HANDLE "\$local_min_length_saving = 4;\n\n";
123 print ACRONYM_HANDLE "#allow recusive acronyms (0 = false (high precision), 1 = true)\n";
124 print ACRONYM_HANDLE "\$local_allow_recursive = 0;\n\n";
125 print ACRONYM_HANDLE "#stop words-words allowed in acronyms (the multi-lingual version\n";
126 print ACRONYM_HANDLE "#slows down acronym extraction slightly so is not the default)\n";
127 print ACRONYM_HANDLE "#\@local_stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n";
128 print ACRONYM_HANDLE "\@local_stop_words = split / /, \"OF AT THE IN TO AND\";\n";
129 print ACRONYM_HANDLE "\n";
130 print ACRONYM_HANDLE "#the file to collate acronyms into\n";
131 print ACRONYM_HANDLE "\$local_acronym_accumulate_file = \$ENV{'GSDLCOLLECTDIR'} . \"/etc/acronym_definitions.pm\";\n";
132 print ACRONYM_HANDLE "\n";
133 print ACRONYM_HANDLE "# any acronym definitions which should always be marked up can be copied here\n";
134 print ACRONYM_HANDLE "# from the acronym_accumulate_file file ...\n";
135 print ACRONYM_HANDLE "# \n";
136 print ACRONYM_HANDLE "# \n";
137 print ACRONYM_HANDLE "# \n";
138 print STDERR "written new options file to $acronym_options_file...\n";
139 } else {
140 print STDERR "read file $acronym_options_file...\n";
141 eval $file_text ;
142 warn $@ if $@;
143 print STDERR "evaluated file $acronym_options_file...\n";
144 }
145
146 $max_offset = $local_max_offset;
147 $upper_case = $local_upper_case;
148 $case_match = $local_case_match ;
149 $min_def_length = $local_min_def_length;
150 $min_acro_length = $local_min_acro_length;
151 $min_length_saving = $local_min_length_saving;
152 $allow_recursive = $local_allow_recursive;
153 $allow_all_caps = $local_allow_all_caps;
154 @stop_words = @local_stop_words;
155
156 $local_acronym_accumulate_file = $local_acronym_accumulate_file;
157
158
159 &read_all_acronyms_from_file();
160# rename $acronym_file, $acronym_file . "." . int(rand (2<<7)).
161# int(rand (2<<7)). int(rand (2<<7)). int(rand (2<<7));
162 if ($writing_acronyms && open ACRONYM_HANDLE, ">$acronym_accumulate_file")
163 {
164 print ACRONYM_HANDLE "#This is an automatically generated file.\n";
165 print ACRONYM_HANDLE "#\n";
166 print ACRONYM_HANDLE "#If you edit this file and it will be overwritten the next\n";
167 print ACRONYM_HANDLE "#time the acronym code runs unless you set the file to \n";
168 print ACRONYM_HANDLE "#read-only. \n";
169 print ACRONYM_HANDLE "#\n";
170 print ACRONYM_HANDLE "#start of acronyms...\n";
171 $writing_acronyms = 1;
172 } else {
173 warn "failed to open $acronym_accumulate_file for writing\n";
174 $writing_acronyms = 0;
175 }
176}
177
178#close the list of accumulated acronyms
179sub finalise_acronyms {
180 if ($writing_acronyms)
181 {
182 print ACRONYM_HANDLE "#end of acronyms.\n";
183 close ACRONYM_HANDLE;
184 }
185}
186
187#eval a file of accumulated acronyms
188sub read_all_acronyms_from_file {
189
190 my $file_text = " ";
191 if (open ACRONYM_HANDLE, "<$acronym_accumulate_file")
192 {
193 $file_text = do { local $/; <ACRONYM_HANDLE> };
194 } else {
195 print STDERR "failed to open $acronym_accumulate_file for reading (this is the first pass?).\n";
196 }
197 eval $file_text;
198 #promotes warnings/errors from evaluated file to current context
199 warn $@ if $@;
200}
201
202#called from within the file of accumulated acronyms to indicate a good acronym
203sub add {
204 my $self = shift (@_);
205 if (defined ($acronyms_found_in_collection{$self->[0]}))
206 {
207 my $def = $self->to_def_string();
208 if ($acronyms_found_in_collection{$self->[0]} =~ m/(^|\|)$def(\||$)/)
209 {
210 return;
211 }
212 $acronyms_found_in_collection{$self->[0]} =
213 $acronyms_found_in_collection{$self->[0]} . "|" . $self->to_def_string();
214 } else {
215 $acronyms_found_in_collection{$self->[0]} = $self->to_def_string();
216 }
217}
218
219#called from within the file of accumulated acronyms to indicate a bad acronym
220sub ban {
221 my $self = shift (@_);
222
223 if (!defined $acronyms_banned_from_collection{$self->[0]})
224 {
225 $acronyms_banned_from_collection{$self->[0]} = $self->to_def_string();
226 } else {
227 $acronyms_banned_from_collection{$self->[0]} = $acronyms_banned_from_collection{$self->[0]} . "|" . $self->to_def_string();
228 }
229}
230
231
232#write a good acronym to the accumulated acronyms file
233sub write_to_file {
234 my $self = shift (@_);
235 if ($writing_acronyms)
236 {
237 print ACRONYM_HANDLE "new acronym(\"$self->[0]\",\"" .
238 $self->to_def_string() .
239 "\")->add();\n";
240 }
241}
242
243
244###########################################################################
245# mark functionality
246###########################################################################
247
248#small routine to sort by length
249sub sort_by_length {
250 length($b) <=> length($a) or $a cmp $b
251}
252
253sub markup_acronyms {
254 my $text = shift (@_);
255 my $verbosity_obj = shift (@_);
256 if (defined $text)
257 {
258 for my $acro (sort sort_by_length keys %acronyms_found_in_collection)
259 {
260 $text =~ s/^((?:[^\<\n]|(?:\<[^\>\n]*\>))*)$acro([^\<A-Z])/$1$acro\<img src=\"\" width=8 height=8 alt=\"$acronyms_found_in_collection{$acro}\"\>$2/gm;
261 printf STDERR " " . $acro . ","
262 if ($verbosity_obj->{'verbosity'} >= 2);
263 }
264 }
265 return $text;
266}
267
268
269
270###########################################################################
271# member functions
272###########################################################################
273
274
275sub new {
276 my $trash = shift (@_);
277 my $acro = shift (@_);
278 my $def = shift (@_);
279
280 my $self = [
281 "", # 0 acronym
282 [], # 1 definition
283 ];
284
285 $self->[0] = $acro if defined $acro;
286 push @{$self->[1]}, split / /, $def if defined $def;
287
288 bless $self;
289}
290
291sub clone {
292 my $self = shift (@_);
293
294 my $copy = new acronym;
295
296 $copy->[0] = $self->[0];
297 push @{$copy->[1]}, @{$self->[1]};
298 bless $copy;
299
300 return $copy;
301}
302
303#return the acronym
304sub to_acronym {
305 my $self = shift (@_);
306 my @array = @{$self->[1]};
307
308 return $self->[0];
309}
310
311#return the number of words in the acronym definition
312sub words_in_acronym_definition {
313 my $self = shift (@_);
314 my @array = @{$self->[1]};
315
316 return $#array + 1;
317}
318
319#return the number of letters in the acronym definition
320sub letters_in_acronym_definition {
321 my $self = shift (@_);
322
323 return length($self->to_def_string());
324}
325
326#return the number of letters in the acronym definition
327sub letters_in_acronym {
328 my $self = shift (@_);
329
330 return length($self->to_acronym());
331}
332
333#return the acronym definition
334sub to_def_string {
335 my $self = shift (@_);
336
337 my $result = "";
338
339 # do the definition
340 my @array = @{$self->[1]};
341 my $i = 0;
342 while ($i <= $#array)
343 {
344 $result = $result . $array[$i];
345
346 if ($i+1 <= $#array)
347 {
348 $result = $result . " ";
349 }
350 $i++;
351 }
352 return $result;
353}
354
355
356#print out the kwic for the acronym
357sub to_string_kwic {
358 my $self = shift (@_);
359
360 # the list of all possible combinations
361 my @list = ();
362
363 my $result = "";
364
365 my $j = 0;
366 my @array = @{$self->[1]};
367 while ($j <= $#array)
368 {
369
370 # do the definition
371 my $i = 0;
372
373 #add the key word
374 $result = "<td halign=left>" . $array[$j] . "</td><td halign=right>";
375
376 #add the proceeding words
377 while ($i < $j)
378 {
379 $result = $result . $array[$i] . " ";
380 $i++;
381 }
382 #add the key word
383 $result = $result . "</td><td halign=left>" . $array[$j] .
384 "</td><td halign=left>";
385
386 #add the trailing words
387 $i++;
388 while ($i <= $#array )
389 {
390 $result = $result . $array[$i] . " ";
391 $i++;
392 }
393
394 #add the actual acronym
395
396 $result = $result . "</td><td halign=left>" . $self->[0] . "</td>";
397
398 push @list, $result;
399 $j++;
400 }
401 return @list;
402}
403
404#this is the one used when building the collection ...
405sub to_string {
406 my $self = shift (@_);
407
408 my $result = $self->[0] . " ";
409
410 # do the definition
411 my @array = @{$self->[1]};
412 my $i = 0;
413 while ($i <= $#array)
414 {
415 $result = $result . $array[$i];
416 if ($i+1 <= $#array)
417 {
418 $result = $result . " ";
419 }
420 $i++;
421 }
422 return $result;
423}
424
425sub check {
426 my $self = shift (@_);
427
428 if (length($self->to_acronym()) < $min_acro_length)
429 {
430# print "acronym " . $self->to_string() . " rejected (too short I)\n";
431 return 0;
432 }
433 if ($self->words_in_acronym_definition() < $min_def_length)
434 {
435# print "acronym " . $self->to_string() . " rejected (too short II)\n";
436 return 0;
437 }
438 if ($min_length_saving * $self->letters_in_acronym() >
439 $self->letters_in_acronym_definition())
440 {
441# print "acronym " . $self->to_string() . " rejected (too short III)\n";
442# print "" . $min_length_saving .
443# "|" . $self->letters_in_acronym() .
444# "|" . $self->letters_in_acronym_definition() . "\n";
445 return 0;
446 }
447 if (!$allow_all_caps &&
448 $self->to_def_string() eq uc($self->to_def_string()))
449 {
450# print "acronym " . $self->to_string() . " rejected (all upper)\n";
451 return 0;
452 }
453 if (!$allow_all_caps)
454 {
455 my $upper_count = 0;
456 my $lower_count = 0;
457 my @letters = $self->to_def_string();
458 for my $letter (split //, $self->to_def_string())
459 {
460 if ($letter eq uc($letter))
461 {
462 $upper_count++;
463 } else {
464 $lower_count++;
465 }
466 }
467 return 0 if ($upper_count > $lower_count);
468 }
469 if (!$allow_recursive && $self->to_def_string() =~ /$self->to_acronym()/i )
470 {
471 return 0;
472 }
473# print "acronym " . $self->to_string() . " not rejected\n";
474 return 1;
475}
476
477###########################################################################
478# static functions
479###########################################################################
480
481sub recurse {
482 my ($acro_offset, #offset of word we're finding acronyms for
483 $text_offset,
484 $letter_offset,
485 @def_so_far) = @_;
486
487 my $word = $split_text[$text_offset];
488 my $acro = $split_text[$acro_offset];
489 $word = "" if !defined $word;
490 $acro = "" if !defined $acro;
491
492# print "recurse(" . $acro_offset . ", " . $text_offset . ", " .
493# $letter_offset . ", " . @def_so_far . ")\n";
494
495 #check for termination ...
496 if ($letter_offset >= length($acro))
497 {
498 my $acronym = new acronym();
499 $acronym->[0] = $acro;
500 push @{$acronym->[1]}, @def_so_far;
501 if ($acronym->check())
502 {
503 push @acronym_list, ( $acronym );
504 }
505# print "acronym created\n";
506 return;
507 }
508 #check for recursion
509 if (!$allow_recursive)
510 {
511 if ($word eq $acro)
512 {
513# print "recursion detected\n";
514 return;
515 }
516 }
517
518 #skip a stop-word
519 my $i = 0;
520 if ($letter_offset != 0)
521 {
522 while ($i <= $#stop_words)
523 {
524 if ($stop_words[$i] eq uc($word))
525 {
526# print "::found stop word::" . $stop_words[$i] . "\n";
527 &recurse($acro_offset,
528 $text_offset+1,
529 $letter_offset,
530 @def_so_far, $word);
531 }
532 $i++;
533 }
534 }
535 $i = 1;
536 #using the first $i letters ...
537 while ($letter_offset+$i <= length($acro) )
538 {
539# print "". ((substr $word, 0, $i) . " " .
540# (substr $acro, $letter_offset, $i) . "\n");
541 if (((!$case_match) &&
542 (uc(substr $word, 0, $i) eq
543 uc(substr $acro, $letter_offset, $i)))
544 ||
545 (($case_match) &&
546 ((substr $word, 0, $i) eq
547 (substr $acro, $letter_offset, $i))))
548 {
549# print "::match::\n";
550# print "" . ((substr $word, 0, $i) . " " .
551# (substr $acro, $letter_offset, $i) . "\n");
552 &recurse($acro_offset,
553 $text_offset+1,
554 $letter_offset+$i,
555 @def_so_far, $word);
556 } else {
557 return;
558 }
559 $i++;
560 }
561 return;
562}
563
564#the main
565sub acronyms {
566 #clean up the text
567 my $processed_text = shift @_;
568 $$processed_text =~ s/<[^>]*>/ /g;
569 $$processed_text =~ s/\W/ /g;
570 $$processed_text =~ s/[0-9_]/ /g;
571 $$processed_text =~ s/\s+/ /g;
572 $$processed_text =~ s/(\n|\>)References.*/ /i;
573 $$processed_text =~ s/(\n|\>)Bibliography.*/ /i;
574 $$processed_text =~ s/(\n|\>)(Cited Works?).*/ /i;
575 $$processed_text =~ s/(\n|\>)(Works? Cited).*/ /i;
576
577 #clear some global variables
578 @split_text = ();
579 @acronym_list = ();
580
581 return &acronyms_from_clean_text($processed_text);
582}
583
584sub acronyms_from_clean_text {
585 my ($processed_text) = @_;
586
587 @split_text = split / /, $$processed_text;
588
589# my $i = 0;
590# while ($i <= $#split_text)
591# {
592# print $split_text[$i] . "\n";
593# $i++;
594# }
595
596 my $first = 0;
597 my $last = $#split_text +1;
598 my $acro_counter = $first;
599
600 while ($acro_counter < $last)
601 {
602 my $word = $split_text[$acro_counter];
603
604 if ((!$upper_case) ||
605 (uc($word) eq $word))
606 {
607
608 if (length $word >= $min_acro_length)
609 {
610 my $def_counter = 0;
611 if ($acro_counter - $max_offset > 0)
612 {
613 $def_counter = $acro_counter - $max_offset;
614 }
615 my $local_last = $acro_counter + $max_offset;
616 if ($local_last > $last)
617 {
618 $local_last = $last;
619 }
620 while ($def_counter <= $local_last)
621 {
622 &recurse($acro_counter,$def_counter,0,());
623 $def_counter++;
624 }
625 }
626 }
627 $acro_counter++;
628 }
629
630 return \@acronym_list;
631}
632
633
634
635sub test {
636
637# my $blarg = new acronym();
638# my $simple;
639# $simple = 10;
640# $blarg->initialise($simple, $simple, $simple);
641# my $blarg2 = $blarg->clone();
642# print $blarg->to_string();
643# print $blarg2;
644# print "\n";
645#
646 my $tla = new acronym();
647 $tla->[0] = "TLA";
648
649 my @array = ("Three", "Letter", "Acronym");
650# my $i = 0;
651# while ($i <= $#array)
652# {
653# print @array[$i] . "\n";
654# $i++;
655# }
656
657 print "\n";
658 push @{$tla->[1]}, ("Three" );
659 push @{$tla->[1]}, ("Letter" );
660 push @{$tla->[1]}, ("Acronym" );
661 print $tla->to_string(). "\n";
662 print "\n";
663 print "\n";
664 my $tla2 = $tla->clone();
665 push @{$tla2->[1]}, ("One");
666 push @{$tla2->[1]}, ("Two");
667 $tla2->[0] = "ALT";
668 print $tla->to_string(). "\n";
669 print $tla2->to_string(). "\n";
670
671 print "\n";
672 print "\n";
673
674 print "Testing recursion ...\n";
675 my $acros = &acronyms("TLA Three Letter Acronym in tla TlA");
676
677 foreach my $acro (@$acros)
678 {
679 if ($acro->check)
680 {
681 print "accepted: " .$acro->to_string() . "\n";
682# print "|" . $acro->to_acronym() . "|" . $acro->to_def_string() .
683# "|" . $acro->words_in_acronym_definition() .
684# "|" . $acro->letters_in_acronym_definition() .
685# "|" . $acro->letters_in_acronym() . "|\n";
686 } else {
687# print "found but rejected: " .$acro->to_string() . "\n";
688 }
689 }
690}
691
692#uncomment this line to test this package
693#&test();
694
6951;
696
697
Note: See TracBrowser for help on using the repository browser.