source: main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/perllib/classify/DateList.pm@ 36609

Last change on this file since 36609 was 36609, checked in by anupama, 20 months ago

committing modified perl code for heritage

  • Property svn:executable set to *
File size: 14.7 KB
Line 
1###########################################################################
2#
3# DateList.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# classifier plugin for sorting by date
27
28# date is assumed to be in the form yyyymmdd
29
30# options
31# -bymonth: splits by month as well as year
32# -nogroup, which makes each year (or year+month) an individual entry in
33# the horizontal list and prevents compaction
34# -metadata, use a different metadata for the date (instead of Date), still expects yyyymmdd format. this affects display cos greenstone displays Date metadata as dd month yyyy, whereas any other date metadata is displayed as yyyymmdd - this needs fixing
35# -sort specifies an additional metadata to use in sorting, will take affect when two docs have the same date.
36# -reverse_sort - sort in reverse order
37# -no_special_formatting - makes the list a VList instead of a DateList - don't display Months down the side of the list
38# -valid_date_regex - what constitutes a valid date? deft is \d\d\d\d. eg for Heritage, want to customise this to allow dates like 198?
39# -allow_invalid_dates - do we include in the classifier documents with invalid dates? deft = no.
40# -invalid_date_partition_name - if docs with invalid dates are included, they get put into one bucket. this gives the name of that bucket. eg "no date"
41package DateList;
42
43use BaseClassifier;
44use sorttools;
45
46use strict;
47no strict 'refs'; # allow filehandles to be variables and viceversa
48
49sub BEGIN {
50 @DateList::ISA = ('BaseClassifier');
51}
52
53my $arguments =
54 [ { 'name' => "metadata",
55 'desc' => "{DateList.metadata}",
56 'type' => "metadata",
57 'deft' => "Date",
58 'reqd' => "yes" } ,
59 { 'name' => "sort",
60 'desc' => "{DateList.sort}",
61 'type' => "metadata",
62 'reqd' => "no" } ,
63 { 'name' => "reverse_sort",
64 'desc' => "{DateList.reverse_sort}",
65 'type' => "flag",
66 'reqd' => "no" },
67 { 'name' => "bymonth",
68 'desc' => "{DateList.bymonth}",
69 'type' => "flag",
70 'reqd' => "no" },
71 { 'name' => "nogroup",
72 'desc' => "{DateList.nogroup}",
73 'type' => "flag",
74 'reqd' => "no" },
75 { 'name' => "no_special_formatting",
76 'desc' => "{DateList.no_special_formatting}",
77 'type' => "flag",
78 'reqd' => "no" },
79 { 'name' => "valid_date_regex",
80 'desc' => "{DateList.valid_date_regex}",
81 'type' => "regexp",
82 'deft' => "\\d\\d\\d\\d",
83 'reqd' => "no" },
84 { 'name' => "allow_invalid_dates",
85 'desc' => "{DateList.allow_invalid_dates}",
86 'type' => "flag",
87 'reqd' => "no" },
88 { 'name' => "invalid_date_partition_name",
89 'desc' => "{DateList.invalid_date_partition_name}",
90 'type' => "string",
91 'deft' => "No Date" }
92 ];
93
94my $options = { 'name' => "DateList",
95 'desc' => "{DateList.desc}",
96 'abstract' => "no",
97 'inherits' => "yes",
98 'args' => $arguments };
99
100
101sub new {
102 my ($class) = shift (@_);
103 my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
104 push(@$classifierslist, $class);
105
106 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
107 push(@{$hashArgOptLists->{"OptList"}},$options);
108
109 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
110
111 if ($self->{'info_only'}) {
112 # don't worry about any options etc
113 return bless $self, $class;
114 }
115
116 # Manually set $self parameters.
117 $self->{'list'} = {};
118
119 if (!defined $self->{"metadata"} || $self->{"metadata"} eq "") {
120 $self->{'metadata'} = "Date";
121 }
122 # remove any ex.s
123 $self->{'metadata'} = $self->strip_ex_from_metadata($self->{'metadata'});
124 $self->{'sort'} = $self->strip_ex_from_metadata($self->{'sort'});
125
126 # now can have comma separated list of Dates - we just use the first value found (for now)
127 my @meta_list = split(/,/, $self->{"metadata"});
128 $self->{'meta_list'} = \@meta_list;
129
130 $self->{'buttonname'} = $self->generate_title_from_metadata($self->{'metadata'}) unless ($self->{'buttonname'});
131
132 $self->{'childtype'} = "DateList";
133 if ($self->{'no_special_formatting'}) {
134 $self->{'childtype'} = "VList";
135 }
136
137 return bless $self, $class;
138}
139
140sub init {
141 my $self = shift (@_);
142
143 $self->{'list'} = {};
144}
145
146sub classify {
147 my $self = shift (@_);
148 my ($doc_obj) = @_;
149
150 my $doc_OID = $doc_obj->get_OID();
151 my $outhandle = $self->{'outhandle'};
152 my $verbosity = $self->{'verbosity'};
153 # find the first available metadata
154 my $date;
155 my $invalid = 0;
156 my $validre = $self->{'valid_date_regex'};
157
158 foreach my $m (@{$self->{'meta_list'}}) {
159 $date = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $m);
160 last if defined $date;
161 }
162
163 if (!defined $date || $date eq "") {
164 if (!$self->{'allow_invalid_dates'}) {
165 # if this document doesn't contain Date element we won't
166 # include it in this classification
167 print $outhandle "DateList: $doc_OID has no date, not including it\n" if $verbosity >=2;
168 return;
169 } else {
170 $invalid = 1;
171 $date = "INVALID";
172 }
173 }
174
175 # sanity check date
176 #if ($date !~ /^\d\d\d\d.*/) {
177 if ($date !~ /^$validre.*/) {
178 if (!$self->{'allow_invalid_dates'}) {
179 print $outhandle "DateList: $doc_OID date: '$date' malformed: expected it to start with $validre; not classifying\n" if $verbosity >=2;
180 return;
181 }
182 else {
183 $invalid = 1;
184 $date = "INVALID";
185 }
186 }
187 if ($self->{'bymonth'} && !$invalid) {
188 # check that we have valid month - if not, set it to 00 == undefined
189 if ($date !~ /^$validre-?\d\d/) {
190 print $outhandle "DateList $doc_OID date: '$date' has no month (expecting yyyymm... or yyyy-mm...), setting date to yyyy-00\n" if $verbosity >=2;
191 $date =~ s/^($validre).*$/$1-00/;
192 } else {
193 my ($year, $month) = $date =~ /^($validre)-?(\d\d)/;
194 if ($month > 12) {
195 print $outhandle "DateList $doc_OID date: '$date' has invalid month, setting date to $year-00\n" if $verbosity >=2;
196 $date = "$year-00";
197 }
198 }
199 }
200
201 my $sort_other = "";
202 if (defined $self->{'sort'} && $self->{'sort'} ne "") {
203 $sort_other = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'sort'});
204 $sort_other = &sorttools::format_metadata_for_sorting($self->{'sort'}, $sort_other, $doc_obj, $self->{'casefold_metadata_for_sorting'}, $self->{'accentfold_metadata_for_sorting'}) unless $self->{'no_metadata_formatting'};
205 }
206
207 if (defined $self->{'list'}->{$doc_OID}) {
208
209 print $outhandle "WARNING: DateList::classify called multiple times for $doc_OID, overwriting previous stored date value ($self->{'list'}->{$doc_OID}) with $date$sort_other \n";
210 }
211
212 $self->{'list'}->{$doc_OID} = "$date$sort_other";
213
214}
215
216
217sub get_classify_info {
218 my $self = shift (@_);
219
220 my @classlist = sort {$self->{'list'}->{$a} cmp $self->{'list'}->{$b};} keys %{$self->{'list'}};
221
222 if ($self->{'reverse_sort'}) {
223 @classlist = reverse @classlist;
224 }
225
226
227 return $self->splitlist (\@classlist);
228}
229
230
231sub get_entry {
232 my $self = shift (@_);
233 my ($title, $childtype, $thistype) = @_;
234
235 # organise into classification structure
236 my %classifyinfo = ('childtype'=>$childtype,
237 'Title'=>$title,
238 'contains'=>[],
239 'mdtype'=>$self->{'metadata'});
240 $classifyinfo{'thistype'} = $thistype
241 if defined $thistype && $thistype =~ /\w/;
242
243 return \%classifyinfo;
244}
245
246# splitlist takes an ordered list of classifications (@$classlistref) and
247# splits it up into sub-sections by date
248sub splitlist {
249 my $self = shift (@_);
250 my ($classlistref) = @_;
251 my $classhash = {};
252
253 # top level
254 my $childtype = "HList";
255
256 if (scalar (@$classlistref) <= 39 &&
257 !$self->{'nogroup'}) {$childtype = $self->{'childtype'};}
258
259 my $classifyinfo = $self->get_entry ($self->{'buttonname'}, $childtype, "Invisible");
260 # don't need to do any splitting if there are less than 39 (max + min -1)
261 # classifications, unless nogroup is specified
262 if ((scalar @$classlistref) <= 39 && !$self->{'nogroup'}) {
263 foreach my $subOID (@$classlistref) {
264 push (@{$classifyinfo->{'contains'}}, {'OID'=>$subOID});
265 }
266 return $classifyinfo;
267 }
268
269 my $validre = $self->{'valid_date_regex'};
270 my $invalid_bucket = $self->{'invalid_date_partition_name'};
271
272 if ($self->{'bymonth'}) {
273 # first split up the list into separate year+month classifications
274
275 if (!$self->{'nogroup'}) { # hlist of year+month pairs
276 # single level of classifications
277 foreach my $classification (@$classlistref) {
278 my $date = $self->{'list'}->{$classification};
279 $date =~ s/^(\d\d\d\d)-?(\d\d).*$/$1&nbsp;_textmonth$2_/;
280 $classhash->{$date} = [] unless defined $classhash->{$date};
281 push (@{$classhash->{$date}}, $classification);
282 }
283
284 } else { # -nogroup - individual years and months
285 foreach my $classification (@$classlistref) {
286 my $date = $self->{'list'}->{$classification};
287 $date =~ s/^(\d\d\d\d)-?(\d\d).*$/$1&nbsp;_textmonth$2_/;
288 my ($year, $month)=($1,$2);
289 # create subclass if it doesn't already exist
290 $classhash->{$year} = () unless defined $classhash->{$year};
291
292 $classhash->{$year}->{$month} = []
293 unless defined $classhash->{$year}->{$month};
294 push (@{$classhash->{$year}->{$month}}, $classification);
295
296 }
297 # create hlist of years containing hlists of months
298
299 my @subclasslist = sort {$a <=> $b} (keys %$classhash);
300 if ($self->{'reverse_sort'}) {
301 @subclasslist = reverse @subclasslist;
302 }
303
304 foreach my $subclass (@subclasslist) {
305 my $yearclassify = $self->get_entry($subclass, "HList");
306 my @subsubclasslist = sort {$a <=> $b} (keys %{$classhash->{$subclass}});
307 if ($self->{'reverse_sort'}) {
308 @subsubclasslist = reverse @subsubclasslist;
309 }
310
311 foreach my $subsubclass (@subsubclasslist) {
312 my $monthname=$subsubclass;
313 $monthname="_textmonth" . $monthname . "_";
314 my $monthclassify=$self->get_entry($monthname, $self->{'childtype'});
315 push (@{$yearclassify->{'contains'}}, $monthclassify);
316
317 foreach my $subsubOID
318 (@{$classhash->{$subclass}->{$subsubclass}}) {
319 push (@{$monthclassify->{'contains'}},
320 {'OID'=>$subsubOID});
321 }
322 }
323 push (@{$classifyinfo->{'contains'}}, $yearclassify);
324 }
325
326 return $classifyinfo;
327 } # nogroup
328 } else {
329 # not by month
330 # first split up the list into separate year classifications
331 foreach my $classification (@$classlistref) {
332 my $date = $self->{'list'}->{$classification};
333 if ($date =~ /^INVALID/) {
334 $date = $invalid_bucket;
335 } else {
336 $date =~ s/^($validre).*$/$1/;
337 }
338 $classhash->{$date} = [] unless defined $classhash->{$date};
339 push (@{$classhash->{$date}}, $classification);
340 }
341
342 }
343
344 # only compact the list if nogroup not specified
345 if (!$self->{'nogroup'}) {
346 print STDERR "compacting list\n";
347 $classhash = $self->compactlist ($classhash);
348 }
349 my @subclasslist = sort keys %$classhash;
350 if ($self->{'reverse_sort'}) {
351 @subclasslist = reverse @subclasslist;
352 }
353 foreach my $subclass (@subclasslist) {
354 my $tempclassify = $self->get_entry($subclass, $self->{'childtype'});
355 foreach my $subsubOID (@{$classhash->{$subclass}}) {
356 push (@{$tempclassify->{'contains'}}, {'OID'=>$subsubOID});
357 }
358 push (@{$classifyinfo->{'contains'}}, $tempclassify);
359 }
360
361 return $classifyinfo;
362}
363
364sub compactlist {
365 my $self = shift (@_);
366 my ($classhashref) = @_;
367 my $compactedhash = {};
368 my @currentOIDs = ();
369 my $currentfirstdate = "";
370 my $currentlastdate = "";
371 my $lastkey = "";
372
373 # minimum and maximum documents to be displayed per page.
374 # the actual maximum will be max + (min-1).
375 # the smallest sub-section is a single letter at present
376 # so in this case there may be many times max documents
377 # displayed on a page.
378 my $min = 10;
379 my $max = 30;
380 my @subsectionlist = sort keys %$classhashref;
381 if ($self->{'reverse_sort'}) {
382 @subsectionlist = reverse @subsectionlist;
383 }
384 foreach my $subsection (@subsectionlist) {
385 if ($subsection eq $self->{'invalid_date_partition_name'}) {
386 # leave this one as is
387 $compactedhash->{$subsection} = $classhashref->{$subsection};
388 next;
389 }
390 # print STDERR "in sub $subsection\n";
391 $currentfirstdate = $subsection if $currentfirstdate eq "";
392 if ((scalar (@currentOIDs) < $min) ||
393 ((scalar (@currentOIDs) + scalar (@{$classhashref->{$subsection}})) <= $max)) {
394 push (@currentOIDs, @{$classhashref->{$subsection}});
395 $currentlastdate = $subsection;
396 } else {
397 if ($currentfirstdate eq $currentlastdate) {
398 @{$compactedhash->{$currentfirstdate}} = @currentOIDs;
399 $lastkey = $currentfirstdate;
400 } else {
401 @{$compactedhash->{"$currentfirstdate-$currentlastdate"}} = @currentOIDs;
402 $lastkey = "$currentfirstdate-$currentlastdate";
403 }
404 if (scalar (@{$classhashref->{$subsection}}) >= $max) {
405 $compactedhash->{$subsection} = $classhashref->{$subsection};
406 @currentOIDs = ();
407 $currentfirstdate = "";
408 $lastkey = $subsection;
409 } else {
410 @currentOIDs = @{$classhashref->{$subsection}};
411 $currentfirstdate = $subsection;
412 $currentlastdate = $subsection;
413 }
414 }
415 }
416
417 # add final OIDs to last sub-classification if there aren't many otherwise
418 # add final sub-classification
419 if (scalar (@currentOIDs) > 0) {
420 if ((scalar (@currentOIDs) < $min)) {
421
422 # want every thing in previous up to the dash
423 my ($newkey) = $lastkey =~ /^([^\-]+)/;
424 @currentOIDs = (@{$compactedhash->{$lastkey}}, @currentOIDs);
425 delete $compactedhash->{$lastkey};
426 @{$compactedhash->{"$newkey-$currentlastdate"}} = @currentOIDs;
427 } else {
428 if ($currentfirstdate eq $currentlastdate) {
429 @{$compactedhash->{$currentfirstdate}} = @currentOIDs;
430 } else {
431 @{$compactedhash->{"$currentfirstdate-$currentlastdate"}} = @currentOIDs;
432 }
433 }
434 }
435
436 return $compactedhash;
437}
438
4391;
Note: See TracBrowser for help on using the repository browser.