[231] | 1 | # classifier plugin for generating hierarchical classifications
|
---|
| 2 |
|
---|
[384] | 3 | # options for this classifier are:
|
---|
| 4 | # file.txt - classification file
|
---|
| 5 | # Metaname - metadata field to test against file.txt
|
---|
| 6 | # sort=Meta - this option is optional (genious;-). by default this
|
---|
| 7 | # classifier will sort documents within each section
|
---|
| 8 | # alphabetically by Title. sort=nosort prevents sorting
|
---|
| 9 | # (i.e. documents will end up in build order), sort=Meta
|
---|
| 10 | # will sort each field alphabetically by Meta (Meta may
|
---|
| 11 | # also be 'Filename' to sort by the original filename).
|
---|
| 12 |
|
---|
| 13 |
|
---|
[231] | 14 | package Hierarchy;
|
---|
| 15 |
|
---|
| 16 | use util;
|
---|
| 17 | use cfgread;
|
---|
[384] | 18 | use sorttools;
|
---|
[231] | 19 |
|
---|
| 20 | sub new {
|
---|
| 21 | my ($class, @options) = @_;
|
---|
| 22 |
|
---|
| 23 | if (!defined @options || (scalar @options < 2)) {
|
---|
| 24 | die "Error in options passed to Hierarchy classification\n" .
|
---|
| 25 | "Usage: Hierarchy subjectfile metadata\n";
|
---|
| 26 | }
|
---|
[384] | 27 | my $sortname = "Title";
|
---|
| 28 | if ((scalar @options > 2) && $options[2] =~ /^sort=(.*)$/i) {
|
---|
| 29 | $sortname = $1;
|
---|
| 30 | $sortname = undef if $sortname =~ /^nosort$/;
|
---|
| 31 | }
|
---|
| 32 |
|
---|
[231] | 33 | my $subjectfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc", $options[0]);
|
---|
| 34 | if (!-e $subjectfile) {
|
---|
| 35 | my $collfile = $subjectfile;
|
---|
| 36 | $subjectfile = &util::filename_cat($ENV{'GSDLHOME'},"etc", $options[0]);
|
---|
| 37 | if (!-e $subjectfile) {
|
---|
[342] | 38 | die "Hierarchy Error: Can't locate subject file $options[0]\n" .
|
---|
[231] | 39 | "This file should be in $collfile or $subjectfile\n";
|
---|
| 40 | }
|
---|
| 41 | }
|
---|
| 42 |
|
---|
| 43 | return bless {
|
---|
[316] | 44 | 'descriptorlist'=>{}, # first field in subject file
|
---|
| 45 | 'locatorlist'=>{}, # second field in subject file
|
---|
[231] | 46 | 'subjectfile' => $subjectfile,
|
---|
[384] | 47 | 'metaname' => $options[1],
|
---|
| 48 | 'sortname' => $sortname
|
---|
[231] | 49 | }, $class;
|
---|
| 50 | }
|
---|
| 51 |
|
---|
| 52 | sub init {
|
---|
| 53 | my $self = shift (@_);
|
---|
| 54 |
|
---|
| 55 | # read in the subject file
|
---|
[253] | 56 | my $list = &cfgread::read_cfg_file ($self->{'subjectfile'}, undef, '^[^#]?\w');
|
---|
[231] | 57 |
|
---|
[316] | 58 | # $list is a hash that is indexed by the descriptor. The contents of this
|
---|
| 59 | # hash is a list of two items. The first item is the OID and the second item
|
---|
| 60 | # is the title
|
---|
[231] | 61 | foreach $descriptor (keys (%$list)) {
|
---|
| 62 | $self->{'descriptorlist'}->{$descriptor} = $list->{$descriptor}->[0];
|
---|
| 63 | unless (defined $self->{'locatorlist'}->{$list->{$descriptor}->[0]}) {
|
---|
| 64 | $self->{'locatorlist'}->{$list->{$descriptor}->[0]}->{'title'} = $list->{$descriptor}->[1];
|
---|
| 65 | $self->{'locatorlist'}->{$list->{$descriptor}->[0]}->{'contents'} = [];
|
---|
| 66 | }
|
---|
| 67 | }
|
---|
| 68 | }
|
---|
| 69 |
|
---|
| 70 | sub classify {
|
---|
| 71 | my $self = shift (@_);
|
---|
| 72 | my ($doc_obj) = @_;
|
---|
| 73 |
|
---|
| 74 | my $doc_OID = $doc_obj->get_OID();
|
---|
[342] | 75 |
|
---|
[231] | 76 | my $metadata = $doc_obj->get_metadata ($doc_obj->get_top_section(),
|
---|
| 77 | $self->{'metaname'});
|
---|
| 78 |
|
---|
| 79 | foreach $metaelement (@$metadata) {
|
---|
| 80 | if ((defined $self->{'descriptorlist'}->{$metaelement}) &&
|
---|
| 81 | (defined $self->{'locatorlist'}->{$self->{'descriptorlist'}->{$metaelement}})) {
|
---|
[384] | 82 |
|
---|
| 83 | my $sortmeta = "";
|
---|
| 84 | if (defined $self->{'sortname'}) {
|
---|
| 85 | if ($self->{'sortname'} =~ /^filename$/i) {
|
---|
| 86 | $sortmeta = $doc_obj->get_source_filename();
|
---|
| 87 | } else {
|
---|
| 88 | $sortmeta = $doc_obj->get_metadata_element($doc_obj->get_top_section(),
|
---|
| 89 | $self->{'sortname'});
|
---|
| 90 | if ($self-{'sortname'} eq "Creator") {
|
---|
| 91 | &sorttools::format_string_name_english ($sortmeta);
|
---|
| 92 | } else {
|
---|
| 93 | &sorttools::format_string_english ($sortmeta);
|
---|
| 94 | }
|
---|
| 95 | }
|
---|
| 96 | $sortmeta = "" unless defined $sortmeta;
|
---|
| 97 | }
|
---|
| 98 |
|
---|
| 99 | push (@{$self->{'locatorlist'}->{$self->{'descriptorlist'}->{$metaelement}}->{'contents'}},
|
---|
| 100 | [$doc_OID, $sortmeta]);
|
---|
[231] | 101 | }
|
---|
| 102 | }
|
---|
| 103 | }
|
---|
| 104 |
|
---|
[316] | 105 |
|
---|
| 106 | sub get_classify_info {
|
---|
[231] | 107 | my $self = shift (@_);
|
---|
[316] | 108 |
|
---|
[231] | 109 | my $list = $self->{'locatorlist'};
|
---|
| 110 |
|
---|
[316] | 111 | my $classifyinfo = $self->get_entry ($self->{'metaname'}, "Hierarchy");
|
---|
[231] | 112 | foreach $OID (keys (%$list)) {
|
---|
[316] | 113 | my $tempinfo = $self->get_OID_entry ($OID, $classifyinfo, $list->{$OID}->{'title'}, "Hierarchy");
|
---|
[384] | 114 |
|
---|
| 115 | if (defined $self->{'sortname'}) {
|
---|
| 116 | foreach $subOID (sort {$a->[1] cmp $b->[1];} @{$list->{$OID}->{'contents'}}) {
|
---|
| 117 | push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]});
|
---|
| 118 | }
|
---|
| 119 | } else {
|
---|
| 120 | foreach $subOID (@{$list->{$OID}->{'contents'}}) {
|
---|
| 121 | push (@{$tempinfo->{'contains'}}, {'OID'=>$subOID->[0]});
|
---|
| 122 | }
|
---|
[316] | 123 | }
|
---|
[231] | 124 | }
|
---|
[384] | 125 |
|
---|
[316] | 126 | return $classifyinfo;
|
---|
[231] | 127 | }
|
---|
| 128 |
|
---|
[316] | 129 |
|
---|
| 130 | sub get_OID_entry {
|
---|
[231] | 131 | my $self = shift (@_);
|
---|
[316] | 132 | my ($OID, $classifyinfo, $title, $classifytype) = @_;
|
---|
| 133 |
|
---|
| 134 | $OID = "" unless defined $OID;
|
---|
| 135 | $OID =~ s/^\.+//;
|
---|
[231] | 136 |
|
---|
[316] | 137 | my ($headOID, $tailOID) = $OID =~ /^(\d+)(.*)$/;
|
---|
| 138 | $tailOID = "" unless defined $tailOID;
|
---|
| 139 |
|
---|
| 140 |
|
---|
| 141 | if (!defined $headOID) {
|
---|
| 142 | $classifyinfo->{'Title'} = $title;
|
---|
| 143 | $classifyinfo->{'classifytype'} = $classifytype;
|
---|
| 144 | return $classifyinfo;
|
---|
[231] | 145 | }
|
---|
[316] | 146 |
|
---|
| 147 | $classifyinfo->{'contains'} = [] unless defined $classifyinfo->{'contains'};
|
---|
| 148 |
|
---|
| 149 | # add entries to the contains list until we have one for headOID
|
---|
| 150 | while (scalar(@{$classifyinfo->{'contains'}}) < $headOID) {
|
---|
| 151 | push (@{$classifyinfo->{'contains'}}, $self->get_entry("", $classifytype));
|
---|
| 152 | }
|
---|
| 153 |
|
---|
| 154 | return $self->get_OID_entry ($tailOID, $classifyinfo->{'contains'}->[($headOID-1)], $title, $classifytype);
|
---|
[231] | 155 | }
|
---|
| 156 |
|
---|
[316] | 157 | sub get_entry {
|
---|
| 158 | my $self = shift (@_);
|
---|
| 159 | my ($title, $classifytype) = @_;
|
---|
| 160 |
|
---|
| 161 | # organise into classification structure
|
---|
| 162 | my %classifyinfo = ('classifytype'=>$classifytype,
|
---|
| 163 | 'Title'=>$title,
|
---|
| 164 | 'contains'=>[]);
|
---|
[231] | 165 |
|
---|
[316] | 166 | return \%classifyinfo;
|
---|
| 167 | }
|
---|
| 168 |
|
---|
[384] | 169 |
|
---|
| 170 |
|
---|
[231] | 171 | 1;
|
---|