1 | # simple list classifier plugin
|
---|
2 | # collect.cfg usage: 'List metaelement' or simply 'List'
|
---|
3 | # the first will create a single list of all documents
|
---|
4 | # with metaelement metadata (sorted alphabetically by whichever
|
---|
5 | # metadata element was chosen).
|
---|
6 | # The second will create a single list of all documents
|
---|
7 | # which won't be sorted at all - they may well be in
|
---|
8 | # some completely random order ;-|
|
---|
9 |
|
---|
10 |
|
---|
11 | package List;
|
---|
12 |
|
---|
13 | sub new {
|
---|
14 | my ($class, @options) = @_;
|
---|
15 |
|
---|
16 | my $list = [];
|
---|
17 | if (defined $options[0]) {
|
---|
18 | $list = {};
|
---|
19 | }
|
---|
20 |
|
---|
21 | return bless {
|
---|
22 | 'OID'=>"NULL",
|
---|
23 | 'list'=>$list,
|
---|
24 | 'metaname' => $options[0]
|
---|
25 | }, $class;
|
---|
26 | }
|
---|
27 |
|
---|
28 | sub init {
|
---|
29 | my $self = shift (@_);
|
---|
30 |
|
---|
31 | if (defined $self->{'metaname'}) {
|
---|
32 | $self->{'list'} = {};
|
---|
33 | } else {
|
---|
34 | $self->{'list'} = [];
|
---|
35 | }
|
---|
36 | }
|
---|
37 |
|
---|
38 | sub set_OID {
|
---|
39 | my $self = shift (@_);
|
---|
40 | my ($OID) = @_;
|
---|
41 |
|
---|
42 | $self->{'OID'} = $OID;
|
---|
43 | }
|
---|
44 |
|
---|
45 | sub get_OID {
|
---|
46 | my $self = shift (@_);
|
---|
47 | return $self->{'OID'};
|
---|
48 | }
|
---|
49 |
|
---|
50 | sub classify {
|
---|
51 | my $self = shift (@_);
|
---|
52 | my ($doc_obj) = @_;
|
---|
53 |
|
---|
54 | my $doc_OID = $doc_obj->get_OID();
|
---|
55 |
|
---|
56 | if (defined $self->{'metaname'}) {
|
---|
57 | my $metavalue = $doc_obj->get_metadata_element ($doc_obj->get_top_section(),
|
---|
58 | $self->{'metaname'});
|
---|
59 | if (defined $metavalue) {
|
---|
60 | if ($self->{'metaname'} eq 'Creator') {
|
---|
61 | $self->format_string_name_english (\$metavalue);
|
---|
62 | } else {
|
---|
63 | $self->format_string_english (\$metavalue);
|
---|
64 | }
|
---|
65 | if (defined $self->{'list'}->{$doc_OID}) {
|
---|
66 | print STDERR "WARNING: List::classify called multiple times for $doc_OID\n";
|
---|
67 | }
|
---|
68 | $self->{'list'}->{$doc_OID} = $metavalue;
|
---|
69 | }
|
---|
70 | } else {
|
---|
71 | push (@{$self->{'list'}}, $doc_OID);
|
---|
72 | }
|
---|
73 | }
|
---|
74 |
|
---|
75 | sub output_classify_info {
|
---|
76 | my $self = shift (@_);
|
---|
77 | my ($handle) = @_;
|
---|
78 |
|
---|
79 | my $classifytitle = "List";
|
---|
80 | my $contains = "";
|
---|
81 | if (defined $self->{'metaname'}) {
|
---|
82 | $classifytitle = $self->{'metaname'};
|
---|
83 | if (keys %{$self->{'list'}}) {
|
---|
84 | $contains = join (";", sort {$self->{'list'}->{$a}
|
---|
85 | cmp $self->{'list'}->{$b};} keys %{$self->{'list'}}) . "\n";
|
---|
86 | }
|
---|
87 | } else {
|
---|
88 | if (scalar (@{$self->{'list'}}) > 0) {
|
---|
89 | $contains = join (";", @{$self->{'list'}}) . "\n";
|
---|
90 | }
|
---|
91 | }
|
---|
92 |
|
---|
93 | print $handle "[$self->{'OID'}]\n";
|
---|
94 | print $handle "<doctype>classify\n";
|
---|
95 | print $handle "<hastxt>0\n";
|
---|
96 | print $handle "<classifytype>List\n";
|
---|
97 | print $handle "<Title>$classifytitle\n";
|
---|
98 | print $handle "<contains>$contains\n" if $contains =~ /\w/;
|
---|
99 | print $handle '-' x 70, "\n";
|
---|
100 | }
|
---|
101 |
|
---|
102 |
|
---|
103 | # format an english name for sorting
|
---|
104 | # i.e. convert to lowercase, put surname before
|
---|
105 | # first names etc.
|
---|
106 | sub format_string_name_english {
|
---|
107 | my $self = shift (@_);
|
---|
108 | my ($stringref) = @_;
|
---|
109 |
|
---|
110 | $$stringref =~ tr/A-Z/a-z/;
|
---|
111 | $$stringref =~ s/[^a-z0-9 ]//g;
|
---|
112 | $$stringref =~ s/\s+/ /g;
|
---|
113 | $$stringref =~ s/^\s+//;
|
---|
114 | my @names = split / /, $$stringref;
|
---|
115 | my $surname = pop @names;
|
---|
116 | while (scalar @names && $surname =~ /^(jnr|snr)$/i) {
|
---|
117 | $surname = pop @names;
|
---|
118 | }
|
---|
119 | $$stringref = $surname . " " . $$stringref;
|
---|
120 | }
|
---|
121 |
|
---|
122 | # format an english string for sorting
|
---|
123 | # i.e. convert to lowercase, remove the, a or an
|
---|
124 | # from beginning of string etc.
|
---|
125 | sub format_string_english {
|
---|
126 | my $self = shift (@_);
|
---|
127 | my ($stringref) = @_;
|
---|
128 |
|
---|
129 | $$stringref =~ tr/A-Z/a-z/;
|
---|
130 | $$stringref =~ s/[^a-z0-9 ]//g;
|
---|
131 | $$stringref =~ s/^\s*(the|a|an)\b//;
|
---|
132 | $$stringref =~ s/^\s+//;
|
---|
133 | }
|
---|
134 |
|
---|
135 |
|
---|
136 | 1;
|
---|