source: main/trunk/greenstone2/perllib/sorttools.pm@ 33898

Last change on this file since 33898 was 33898, checked in by kjdon, 4 years ago

format_metadata_for_sorting now takes two additional args - casefold and accentfold. Do these to the metadata value if they are set to true

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.7 KB
Line 
1###########################################################################
2#
3# sorttools.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# various subroutines to format strings
27# for sorting
28
29# To customise sort functions for a particular collection, create a customsorttools.pm inside
30# collection's perllib folder, and implement any of the functions you want to change.
31
32package sorttools;
33
34use strict;
35use Unicode::Normalize;
36
37my $has_custom_sort = 0;
38
39sub setup_custom_sort {
40
41 my $collectdir = $ENV{'GSDLCOLLECTDIR'};
42 my $customperllibfolder = &FileUtils::filenameConcatenate($collectdir, 'perllib');
43 my $customsortfile = &FileUtils::filenameConcatenate($customperllibfolder, 'customsorttools.pm');
44 if (&FileUtils::fileExists($customsortfile)) {
45 # add perllib folder to INC, if its not already there
46 my $found_perllibfolder = 0;
47 foreach my $path (@INC)
48 {
49 if ($path eq $customperllibfolder)
50 {
51 $found_perllibfolder = 1;
52 last;
53 }
54 }
55 if (!$found_perllibfolder)
56 {
57 unshift (@INC, $customperllibfolder);
58 }
59
60 require customsorttools;
61 $has_custom_sort = 1;
62 }
63
64}
65
66# moved here from BasClas so import can share it
67sub format_metadata_for_sorting {
68 my ($metaname, $metavalue, $doc_obj, $casefold, $accentfold) = @_;
69
70 if (!defined $metaname || $metaname !~ /\S/ || ! defined $metavalue || $metavalue !~ /\S/) {
71 return "";
72 }
73
74 if ($has_custom_sort && defined (&customsorttools::format_metadata_for_sorting)) {
75 return &customsorttools::format_metadata_for_sorting($metaname, $metavalue, $doc_obj, $casefold, $accentfold);
76 }
77
78 if ($metaname eq "Language") {
79 $metavalue = $iso639::fromiso639{$metavalue};
80 return $metavalue;
81 }
82
83 my $lang;
84 if (defined $doc_obj) {
85 $lang = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), 'Language');
86 }
87 $lang = 'en' unless defined $lang;
88
89 if (defined $casefold && $casefold eq "true") {
90 $metavalue = lc($metavalue);
91 }
92 if (defined $accentfold && $accentfold eq "true") {
93 $metavalue = NFKD($metavalue);
94 $metavalue =~ s/\p{NonspacingMark}//g;
95 }
96
97 # is this metadata likely to be a name?
98 my $function_name="format_string_name_$lang";
99 if ($metaname =~ /^(?:\w+\.)?(?:Creators?|Authors?|Editors?)(?:[:,].*)?$/
100 && exists &$function_name) {
101 no strict 'refs';
102 &$function_name(\$metavalue);
103 } else {
104 $function_name="format_string_$lang";
105 if (exists &$function_name) {
106 no strict 'refs';
107 &$function_name(\$metavalue);
108 }
109 }
110
111 return $metavalue;
112}
113
114### language-specific sorting functions (called by format_metadata_for_sorting)
115
116## format_string_$lang() converts to lowercase (where appropriate), and
117# removes punctuation, articles from the start of string, etc
118## format_string_name_$lang() converts to lowercase, puts the surname first,
119# removes punctuation, etc
120
121sub format_string_en {
122 my $stringref = shift;
123
124 if ($has_custom_sort && defined (&customsorttools::format_string_en)) {
125 return &customsorttools::format_string_en($stringref);
126 }
127
128 $$stringref =~ s/&[^\;]+\;//g; # html entities
129 $$stringref =~ s/^\s*(the|a|an)\b//; # articles
130 $$stringref =~ s/[^[:alnum:]]//g; # remove any non-alphanumeric chars from start of the string
131 $$stringref =~ s/\s+/ /g;
132 $$stringref =~ s/^\s+//;
133 $$stringref =~ s/\s+$//;
134}
135
136sub format_string_name_en {
137 my ($stringref) = @_;
138
139 if ($has_custom_sort && defined (&customsorttools::format_string_name_en)) {
140 return &customsorttools::format_string_name_en($stringref);
141 }
142
143 $$stringref =~ s/&\S+;//g;
144
145 my $comma_format = ($$stringref =~ m/^.+,.+$/);
146 $$stringref =~ s/[[:punct:]]//g;
147 $$stringref =~ s/\s+/ /g;
148 $$stringref =~ s/^\s+//;
149 $$stringref =~ s/\s+$//;
150
151
152 if (!$comma_format) {
153 # No commas in name => name in 'firstname surname' format
154 # need to sort by surname
155 my @names = split / /, $$stringref;
156 my $surname = pop @names;
157 while (scalar @names && $surname =~ /^(jnr|snr)$/i) {
158 $surname = pop @names;
159 }
160 $$stringref = $surname . " " . $$stringref;
161 }
162}
163
164
165sub format_string_fr {
166 my $stringref = shift;
167
168 if ($has_custom_sort && defined (&customsorttools::format_string_fr)) {
169 return &customsorttools::format_string_fr($stringref);
170 }
171
172 $$stringref =~ s/&[^\;]+\;//g; # html entities
173 $$stringref =~ s/^\s*(les?|la|une?)\b//; # articles
174 $$stringref =~ s/[^[:alnum:]]//g;
175 $$stringref =~ s/\s+/ /g;
176 $$stringref =~ s/^\s+//;
177 $$stringref =~ s/\s+$//;
178}
179
180sub format_string_es {
181 my $stringref = shift;
182
183 if ($has_custom_sort && defined (&customsorttools::format_string_es)) {
184 return &customsorttools::format_string_es($stringref);
185 }
186
187 $$stringref =~ s/&[^\;]+\;//g; # html entities
188 $$stringref =~ s/^\s*(la|el)\b//; # articles
189 $$stringref =~ s/[^[:alnum:]]//g;
190 $$stringref =~ s/\s+/ /g;
191 $$stringref =~ s/^\s+//;
192 $$stringref =~ s/\s+$//;
193}
194
195### end of language-specific functions
196
197# takes arguments of day, month, year and converts to
198# date of form yyyymmdd. month may be full (e.g. "January"),
199# abbreviated (e.g. "Jan"), or a number (1-12). Years like "86"
200# will be assumed to be "1986".
201sub format_date {
202 my ($day, $month, $year) = @_;
203
204 if ($has_custom_sort && defined (&customsorttools::format_date)) {
205 return &customsorttools::format_date($day, $month, $year);
206 }
207
208 my %months = ('january' => '01', 'jan' => '01', 'february' => '02', 'feb' => '02',
209 'march' => '03', 'mar' => '03', 'april' => '04', 'apr' => '04',
210 'may' => '05', 'june' => '06', 'jun' => '06', 'july' => '07',
211 'jul' => '07', 'august' => '08', 'aug' => '08', 'september' => '09',
212 'sep' => '09', 'october' => '10', 'oct' => '10', 'november' => '11',
213 'nov' => '11', 'december' => '12', 'dec' => '12');
214
215 $month =~ tr/A-Z/a-z/;
216
217 if ($day < 1) {
218 print STDERR "sorttools::format_date WARNING day $day out of range\n";
219 $day = "01";
220 } elsif ($day > 31) {
221 print STDERR "sorttools::format_date WARNING day $day out of range\n";
222 $day = "31";
223 }
224
225 $day = "0$day" if (length($day) == 1);
226
227 if ($month =~ /^\d\d?$/) {
228 if ($month < 1) {
229 print STDERR "sorttools::format_date WARNING month $month out of range\n";
230 $month = "01";
231 } elsif ($month > 12) {
232 print STDERR "sorttools::format_date WARNING month $month out of range\n";
233 $month = "12";
234 }
235 if ($month =~ /^\d$/) {
236 $month = "0" . $month;
237 }
238 } elsif (!defined $months{$month}) {
239 print STDERR "sorttools::format_date WARNING month $month out of range\n";
240 $month = "01";
241 } else {
242 $month = $months{$month};
243 }
244
245 if ($year !~ /^\d\d\d\d$/) {
246 if ($year !~ /^\d\d$/) {
247 my $newyear = 1900 + $year;
248 print STDERR "sorttools::format_date WARNING year $year assumed to be $newyear\n";
249 $year=$newyear;
250 } else {
251 print STDERR "sorttools::format_date WARNING year $year out of range - reset to 1900\n";
252 $year = "1900";
253 }
254 }
255
256 return "$year$month$day";
257}
258
259
2601;
Note: See TracBrowser for help on using the repository browser.