1 | # classifier plugin for generating hierarchical classifications
|
---|
2 |
|
---|
3 | # options for this classifier are:
|
---|
4 | # file.txt - classification file
|
---|
5 | # Metaname - metadata field to test against file.txt
|
---|
6 | # sort=Meta - this option is optional (genious;-). by default this
|
---|
7 | # classifier will sort documents within each section
|
---|
8 | # alphabetically by Title. sort=nosort prevents sorting
|
---|
9 | # (i.e. documents will end up in build order), sort=Meta
|
---|
10 | # will sort each field alphabetically by Meta (Meta may
|
---|
11 | # also be 'Filename' to sort by the original filename).
|
---|
12 |
|
---|
13 |
|
---|
14 | package Hierarchy;
|
---|
15 |
|
---|
16 | use util;
|
---|
17 | use cfgread;
|
---|
18 | use sorttools;
|
---|
19 |
|
---|
20 | sub new {
|
---|
21 | my ($class, @options) = @_;
|
---|
22 |
|
---|
23 | if (!defined @options || (scalar @options < 2)) {
|
---|
24 | die "Error in options passed to Hierarchy classification\n" .
|
---|
25 | "Usage: Hierarchy subjectfile metadata\n";
|
---|
26 | }
|
---|
27 | my $sortname = "Title";
|
---|
28 | if ((scalar @options > 2) && $options[2] =~ /^sort=(.*)$/i) {
|
---|
29 | $sortname = $1;
|
---|
30 | $sortname = undef if $sortname =~ /^nosort$/;
|
---|
31 | }
|
---|
32 |
|
---|
33 | my $subjectfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc", $options[0]);
|
---|
34 | if (!-e $subjectfile) {
|
---|
35 | my $collfile = $subjectfile;
|
---|
36 | $subjectfile = &util::filename_cat($ENV{'GSDLHOME'},"etc", $options[0]);
|
---|
37 | if (!-e $subjectfile) {
|
---|
38 | die "Hierarchy Error: Can't locate subject file $options[0]\n" .
|
---|
39 | "This file should be in $collfile or $subjectfile\n";
|
---|
40 | }
|
---|
41 | }
|
---|
42 |
|
---|
43 | return bless {
|
---|
44 | 'descriptorlist'=>{}, # first field in subject file
|
---|
45 | 'locatorlist'=>{}, # second field in subject file
|
---|
46 | 'subjectfile' => $subjectfile,
|
---|
47 | 'metaname' => $options[1],
|
---|
48 | 'sortname' => $sortname
|
---|
49 | }, $class;
|
---|
50 | }
|
---|
51 |
|
---|
52 | sub init {
|
---|
53 | my $self = shift (@_);
|
---|
54 |
|
---|
55 | # read in the subject file
|
---|
56 | my $list = &cfgread::read_cfg_file ($self->{'subjectfile'}, undef, '^[^#]?\w');
|
---|
57 |
|
---|
58 | # $list is a hash that is indexed by the descriptor. The contents of this
|
---|
59 | # hash is a list of two items. The first item is the OID and the second item
|
---|
60 | # is the title
|
---|
61 | foreach $descriptor (keys (%$list)) {
|
---|
62 | $self->{'descriptorlist'}->{$descriptor} = $list->{$descriptor}->[0];
|
---|
63 | unless (defined $self->{'locatorlist'}->{$list->{$descriptor}->[0]}) {
|
---|
64 | $self->{'locatorlist'}->{$list->{$descriptor}->[0]}->{'title'} = $list->{$descriptor}->[1];
|
---|
65 | $self->{'locatorlist'}->{$list->{$descriptor}->[0]}->{'contents'} = [];
|
---|
66 | }
|
---|
67 | }
|
---|
68 | }
|
---|
69 |
|
---|
70 | sub classify {
|
---|
71 | my $self = shift (@_);
|
---|
72 | my ($doc_obj) = @_;
|
---|
73 |
|
---|
74 | my $doc_OID = $doc_obj->get_OID();
|
---|
75 |
|
---|
76 | my $metadata = $doc_obj->get_metadata ($doc_obj->get_top_section(),
|
---|
77 | $self->{'metaname'});
|
---|
78 |
|
---|
79 | foreach $metaelement (@$metadata) {
|
---|
80 | if ((defined $self->{'descriptorlist'}->{$metaelement}) &&
|
---|
81 | (defined $self->{'locatorlist'}->{$self->{'descriptorlist'}->{$metaelement}})) {
|
---|
82 |
|
---|
83 | my $sortmeta = "";
|
---|
84 | if (defined $self->{'sortname'}) {
|
---|
85 | if ($self->{'sortname'} =~ /^filename$/i) {
|
---|
86 | $sortmeta = $doc_obj->get_source_filename();
|
---|
87 | } else {
|
---|
88 | $sortmeta = $doc_obj->get_metadata_element($doc_obj->get_top_section(),
|
---|
89 | $self->{'sortname'});
|
---|
90 | if ($self-{'sortname'} eq "Creator") {
|
---|
91 | &sorttools::format_string_name_english ($sortmeta);
|
---|
92 | } else {
|
---|
93 | &sorttools::format_string_english ($sortmeta);
|
---|
94 | }
|
---|
95 | }
|
---|
96 | $sortmeta = "" unless defined $sortmeta;
|
---|
97 | }
|
---|
98 |
|
---|
99 | push (@{$self->{'locatorlist'}->{$self->{'descriptorlist'}->{$metaelement}}->{'contents'}},
|
---|
100 | [$doc_OID, $sortmeta]);
|
---|
101 | }
|
---|
102 | }
|
---|
103 | }
|
---|
104 |
|
---|
105 |
|
---|
106 | sub get_classify_info {
|
---|
107 | my $self = shift (@_);
|
---|
108 |
|
---|
109 | my $list = $self->{'locatorlist'};
|
---|
110 |
|
---|
111 | my $classifyinfo = $self->get_entry ($self->{'metaname'}, "Hierarchy");
|
---|
112 | foreach $OID (keys (%$list)) {
|
---|
113 | my $tempinfo = $self->get_OID_entry ($OID, $classifyinfo, $list->{$OID}->{'title'}, "Hierarchy");
|
---|
114 |
|
---|
115 | if (defined $self->{'sortname'}) {
|
---|
116 | foreach $subOID (sort {$a->[1] cmp $b->[1];} @{$list->{$OID}->{'contents'}}) {
|
---|
117 | push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]});
|
---|
118 | }
|
---|
119 | } else {
|
---|
120 | foreach $subOID (@{$list->{$OID}->{'contents'}}) {
|
---|
121 | push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]});
|
---|
122 | }
|
---|
123 | }
|
---|
124 | }
|
---|
125 |
|
---|
126 | return $classifyinfo;
|
---|
127 | }
|
---|
128 |
|
---|
129 |
|
---|
130 | sub get_OID_entry {
|
---|
131 | my $self = shift (@_);
|
---|
132 | my ($OID, $classifyinfo, $title, $classifytype) = @_;
|
---|
133 |
|
---|
134 | $OID = "" unless defined $OID;
|
---|
135 | $OID =~ s/^\.+//;
|
---|
136 |
|
---|
137 | my ($headOID, $tailOID) = $OID =~ /^(\d+)(.*)$/;
|
---|
138 | $tailOID = "" unless defined $tailOID;
|
---|
139 |
|
---|
140 |
|
---|
141 | if (!defined $headOID) {
|
---|
142 | $classifyinfo->{'Title'} = $title;
|
---|
143 | $classifyinfo->{'classifytype'} = $classifytype;
|
---|
144 | return $classifyinfo;
|
---|
145 | }
|
---|
146 |
|
---|
147 | $classifyinfo->{'contains'} = [] unless defined $classifyinfo->{'contains'};
|
---|
148 |
|
---|
149 | # add entries to the contains list until we have one for headOID
|
---|
150 | while (scalar(@{$classifyinfo->{'contains'}}) < $headOID) {
|
---|
151 | push (@{$classifyinfo->{'contains'}}, $self->get_entry("", $classifytype));
|
---|
152 | }
|
---|
153 |
|
---|
154 | return $self->get_OID_entry ($tailOID, $classifyinfo->{'contains'}->[($headOID-1)], $title, $classifytype);
|
---|
155 | }
|
---|
156 |
|
---|
157 | sub get_entry {
|
---|
158 | my $self = shift (@_);
|
---|
159 | my ($title, $classifytype) = @_;
|
---|
160 |
|
---|
161 | # organise into classification structure
|
---|
162 | my %classifyinfo = ('classifytype'=>$classifytype,
|
---|
163 | 'Title'=>$title,
|
---|
164 | 'contains'=>[]);
|
---|
165 |
|
---|
166 | return \%classifyinfo;
|
---|
167 | }
|
---|
168 |
|
---|
169 |
|
---|
170 |
|
---|
171 | 1;
|
---|