root/gsdl/trunk/perllib/basebuildproc.pm @ 15695

Revision 15695, 17.8 KB (checked in by mdewsnip, 12 years ago)

(Adding new DB support) Starting to tidy up the infodb() function in preparation for separating out the GDBM stuff.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# basebuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in the database
28
29package basebuildproc;
30
31eval {require bytes};
32
33use classify;
34use doc;
35use docproc;
36use util;
37
38BEGIN {
39    @basebuildproc::ISA = ('docproc');
40}
41
42sub new()
43  {
44    my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
45    my $self = new docproc ();
46
47    # outhandle is where all the debugging info goes
48    # output_handle is where the output of the plugins is piped
49    # to (i.e. mg, database etc.)
50    $outhandle = STDERR unless defined $outhandle;
51
52    $self->{'collection'} = $collection;
53    $self->{'source_dir'} = $source_dir;
54    $self->{'build_dir'}  = $build_dir;
55    $self->{'keepold'}    = $keepold;
56    $self->{'verbosity'}  = $verbosity;
57    $self->{'outhandle'}  = $outhandle;
58
59    $self->{'classifiers'} = [];
60    $self->{'mode'} = "text";
61    $self->{'assocdir'} = $build_dir;
62    $self->{'dontdb'} = {};
63
64    $self->{'index'} = "section:text";
65    $self->{'indexexparr'} = [];
66
67    my $found_num_data = 0;
68    my $buildconfigfile = undef;
69
70    if ($keepold) {
71    # For incremental building need to seed num_docs etc from values
72    # stored in build.cfg (if present)
73      print STDERR "Keepold!\n";
74    $buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
75      print STDERR "Build cfg: $buildconfigfile\n";
76    if (-e $buildconfigfile) {
77        $found_num_data = 1;
78    }
79    else {
80        # try the index dir
81        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
82                           "index", "build.cfg");
83            print STDERR "Index cfg: $buildconfigfile\n";
84        if (-e $buildconfigfile) {
85        $found_num_data = 1;
86        }
87    }
88
89    }
90    #else
91    #  {
92    #    print STDERR "Removeold!\n";
93    #  }
94
95    if ($found_num_data)
96      {
97        #print STDERR "Found_Num_Data!\n";
98    my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
99    $self->{'starting_num_docs'}     = $buildcfg->{'numdocs'};
100        #print STDERR "- num_docs:     $self->{'starting_num_docs'}\n";
101    $self->{'starting_num_sections'} = $buildcfg->{'numsections'};
102        #print STDERR "- num_sections: $self->{'starting_num_sections'}\n";
103    $self->{'starting_num_bytes'}    = $buildcfg->{'numbytes'};
104        #print STDERR "- num_bytes:    $self->{'starting_num_bytes'}\n";
105    }
106    else
107      {
108        #print STDERR "NOT Found_Num_Data!\n";
109        $self->{'starting_num_docs'}     = 0;
110    $self->{'starting_num_sections'} = 0;
111    $self->{'starting_num_bytes'}    = 0;
112      }
113
114    $self->{'output_handle'} = "STDOUT";
115    $self->{'num_docs'}      = $self->{'starting_num_docs'};
116    $self->{'num_sections'}  = $self->{'starting_num_sections'};
117    $self->{'num_bytes'}     = $self->{'starting_num_bytes'};
118
119    $self->{'num_processed_bytes'} = 0;
120    $self->{'store_text'} = 1;
121
122    # what level (section/document) the database - indexer intersection is
123    $self->{'db_level'} = "section";
124    #used by browse interface
125    $self->{'doclist'} = [];
126
127    $self->{'indexing_text'} = 0;
128
129    return bless $self, $class;
130
131}
132
133sub reset {
134    my $self = shift (@_);
135
136    $self->{'num_docs'}      = $self->{'starting_num_docs'};
137    $self->{'num_sections'}  = $self->{'starting_num_sections'};
138    $self->{'num_bytes'}     = $self->{'starting_num_bytes'};
139   
140    $self->{'num_processed_bytes'} = 0;
141}
142
143sub zero_reset {
144    my $self = shift (@_);
145
146    $self->{'num_docs'}      = 0;
147    $self->{'num_sections'}  = 0;
148    $self->{'num_bytes'}     = 0;
149   
150    $self->{'num_processed_bytes'} = 0;
151}
152
153sub is_incremental_capable
154{
155    # By default we return 'no' as the answer
156    # Safer to assume non-incremental to start with, and then override in
157    # inherited classes that are.
158
159    return 0;
160}
161
162sub get_num_docs {
163    my $self = shift (@_);
164
165    return $self->{'num_docs'};
166}
167
168sub get_num_sections {
169    my $self = shift (@_);
170
171    return $self->{'num_sections'};
172}
173
174# num_bytes is the actual number of bytes in the collection
175# this is normally the same as what's processed during text compression
176sub get_num_bytes {
177    my $self = shift (@_);
178
179    return $self->{'num_bytes'};
180}
181
182# num_processed_bytes is the number of bytes actually passed
183# to mg for the current index
184sub get_num_processed_bytes {
185    my $self = shift (@_);
186
187    return $self->{'num_processed_bytes'};
188}
189
190sub set_output_handle {
191    my $self = shift (@_);
192    my ($handle) = @_;
193
194    $self->{'output_handle'} = $handle;
195}
196
197
198sub set_mode {
199    my $self = shift (@_);
200    my ($mode) = @_;
201
202    $self->{'mode'} = $mode;
203}
204
205sub get_mode {
206    my $self = shift (@_);
207
208    return $self->{'mode'};
209}
210
211sub set_assocdir {
212    my $self = shift (@_);
213    my ($assocdir) = @_;
214
215    $self->{'assocdir'} = $assocdir;
216}
217
218sub set_dontdb {
219    my $self = shift (@_);
220    my ($dontdb) = @_;
221
222    $self->{'dontdb'} = $dontdb;
223}
224
225sub set_index {
226    my $self = shift (@_);
227    my ($index, $indexexparr) = @_;
228
229    $self->{'index'} = $index;
230    $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
231}
232
233sub set_index_languages {
234    my $self = shift (@_);
235    my ($lang_meta, $langarr) = @_;
236    $self->{'lang_meta'} = $lang_meta;
237    $self->{'langarr'} = $langarr;
238}
239
240sub get_index {
241    my $self = shift (@_);
242
243    return $self->{'index'};
244}
245
246sub set_classifiers {
247    my $self = shift (@_);
248    my ($classifiers) = @_;
249
250    $self->{'classifiers'} = $classifiers;
251}
252
253sub set_indexing_text {
254    my $self = shift (@_);
255    my ($indexing_text) = @_;
256
257    $self->{'indexing_text'} = $indexing_text;
258}
259
260sub get_indexing_text {
261    my $self = shift (@_);
262
263    return $self->{'indexing_text'};
264}
265
266sub set_store_text {
267    my $self = shift (@_);
268    my ($store_text) = @_;
269
270    $self->{'store_text'} = $store_text;
271}
272sub get_doc_list {
273    my $self = shift(@_);
274   
275    return @{$self->{'doclist'}};
276}
277
278# the standard database level is section, but you may want to change it to document
279sub set_db_level {
280    my $self= shift (@_);
281    my ($db_level) = @_;
282
283    $self->{'db_level'} = $db_level;
284}
285
286sub set_sections_index_document_metadata {
287    my $self= shift (@_);
288    my ($index_type) = @_;
289   
290    $self->{'sections_index_document_metadata'} = $index_type;
291}
292sub process {
293    my $self = shift (@_);
294    my $method = $self->{'mode'};
295
296    $self->$method(@_);
297}
298
299
300
301sub infodb_metadata_stats
302{
303    my $self = shift (@_);
304    my ($field) = @_;
305
306    # Keep some statistics relating to metadata sets used and
307    # frequency of particular metadata fields within each set
308
309    # Union of metadata prefixes and frequency of fields
310    # (both scoped for this document alone, and across whole collection)
311   
312    if ($field =~ m/^(.+)\.(.*)$/) {
313    my $prefix = $1;
314    my $core_field = $2;
315
316    $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}++;
317    $self->{'mdprefix_fields'}->{$prefix}->{$core_field}++;
318    }
319    elsif ($field =~ m/^[[:upper:]]/) {
320    # implicit 'ex' metadata set
321
322    $self->{'doc_mdprefix_fields'}->{'ex'}->{$field}++;
323    $self->{'mdprefix_fields'}->{'ex'}->{$field}++;
324    }
325
326}
327
328
329sub infodb {
330    my $self = shift (@_);
331    my ($doc_obj, $filename) = @_;
332    my $handle = $self->{'output_handle'};
333
334    my $doctype = $doc_obj->get_doc_type();
335
336    # only output this document if it is a "indexed_doc" or "info_doc" (database only) document
337    return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
338
339    my $archivedir = "";
340
341    if (defined $filename)
342    {
343    # doc_obj derived directly from file
344
345    my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
346    $dir = "" unless defined $dir;
347    $dir =~ s/\\/\//g;
348    $dir =~ s/^\/+//;
349    $dir =~ s/\/+$//;
350
351    $archivedir = $dir;
352
353    # resolve the final filenames of the files associated with this document
354    $self->assoc_files ($doc_obj, $archivedir);
355    }
356    else
357    {
358    # doc_obj reconstructed from database (has metadata, doc structure but no text)
359    my $top_section = $doc_obj->get_top_section();
360    $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
361    }
362
363
364    #GRB: moved 1/06/2004 from GRB01062004
365    #add this document to the browse structure
366    push(@{$self->{'doclist'}},$doc_obj->get_OID())
367    unless ($doctype eq "classification");
368
369    # classify this document
370    &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
371    #GRB: end of moved block
372
373    # this is another document
374    $self->{'num_docs'} += 1 unless ($doctype eq "classification");
375
376    # is this a paged or a hierarchical document
377    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
378
379    my $section = $doc_obj->get_top_section ();
380    my $doc_OID = $doc_obj->get_OID();
381    my $first = 1;
382    my $url = "";
383
384    $self->{'doc_mdprefix_fields'} = {};
385
386    while (defined $section)
387    {
388    my $section_OID = $doc_OID;
389    if ($section ne "")
390    {
391        $section_OID = $doc_OID . "." . $section;
392    }
393
394    # update a few statistics
395    $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
396    $self->{'num_sections'} += 1 unless ($doctype eq "classification");
397
398    # output the section name
399    print $handle "[$section_OID]\n";
400
401    # output the fact that this document is a document (unless doctype
402    # has been set to something else from within a plugin
403    my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
404    if (!defined $dtype || $dtype !~ /\w/) {
405        print $handle "<doctype>doc\n";
406    }
407
408    # Output whether this node contains text
409    #
410    # If doc_obj reconstructed from database file then no need to
411    # explicitly add <hastxt> as this is preserved as metadata when
412    # the database file is loaded in
413
414    if (defined $filename)
415    {
416        # doc_obj derived directly from file
417        if ($doc_obj->get_text_length($section) > 0) {
418        print $handle "<hastxt>1\n";
419        } else {
420        print $handle "<hastxt>0\n";
421        }
422    }
423
424    # output all the section metadata
425    my $metadata = $doc_obj->get_all_metadata ($section);
426    foreach my $pair (@$metadata) {
427        my ($field, $value) = (@$pair);
428
429        if ($field ne "Identifier" && $field !~ /^gsdl/ &&
430        defined $value && $value ne "") {       
431
432        # escape problematic stuff
433        $value =~ s/\\/\\\\/g;
434        $value =~ s/\n/\\n/g;
435        $value =~ s/\r/\\r/g;
436        if ($value =~ /-{70,}/) {
437            # if value contains 70 or more hyphens in a row we need
438            # to escape them to prevent txt2db from treating them
439            # as a separator
440            $value =~ s/-/&\#045;/gi;
441        }
442
443        # special case for URL metadata
444        if ($field =~ /^URL$/i) {
445                    $url .= "[$value]\n";
446            $url .= "<section>$section_OID\n";
447                    $url .= '-' x 70 . "\n";
448        }
449
450        if (!defined $self->{'dontdb'}->{$field}) {
451            print $handle "<$field>$value\n";
452
453            if ($section eq "")
454            {
455            $self->infodb_metadata_stats($field);
456            }
457        }
458        }
459    }
460
461    if ($section eq "")
462    {
463        my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};
464
465        foreach my $prefix (keys %$doc_mdprefix_fields)
466        {
467        print $handle "<metadataset>$prefix\n";
468
469        foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
470        {
471            my $val = $doc_mdprefix_fields->{$prefix}->{$field};
472
473            print $handle "<metadatalist-$prefix>$field\n";
474            print $handle "<metadatafreq-$prefix-$field>$val\n";
475        }
476
477        }
478    }
479
480    # If doc_obj reconstructed from database file then no need to
481    # explicitly add <archivedir> as this is preserved as metadata when
482    # the database file is loaded in
483
484    if (defined $filename)
485    {
486        # output archivedir if at top level
487        if ($section eq $doc_obj->get_top_section()) {
488        print $handle "<archivedir>$archivedir\n";
489        }
490    }
491
492    # output document display type
493    if ($first) {
494        print $handle "<thistype>$thistype\n";
495    }
496
497
498    if ($self->{'db_level'} eq "document") {
499        # doc num is num_docs not num_sections
500        # output the matching document number
501        print $handle "<docnum>$self->{'num_docs'}\n";
502           
503    } else {
504        # output a list of children
505        my $children = $doc_obj->get_children ($section);
506        if (scalar(@$children) > 0) {
507        print $handle "<childtype>$childtype\n";
508        print $handle "<contains>";
509        my $firstchild = 1;
510        foreach my $child (@$children) {
511            print $handle ";" unless $firstchild;
512            $firstchild = 0;
513            if ($child =~ /^.*?\.(\d+)$/) {
514            print $handle "\".$1";
515            } else {
516            print $handle "\".$child";
517            }
518        }
519        print $handle "\n";
520        }
521        #output the matching doc number
522        print $handle "<docnum>$self->{'num_sections'}\n";
523       
524    }
525   
526    print $handle '-' x 70, "\n";
527
528   
529    # output a database entry for the document number
530    if ($self->{'db_level'} eq "document") {
531        print $handle "[$self->{'num_docs'}]\n";
532        print $handle "<section>$doc_OID\n";
533    }
534    else {
535        print $handle "[$self->{'num_sections'}]\n";
536        print $handle "<section>$section_OID\n";
537    }
538    print $handle '-' x 70, "\n";
539   
540        # output entry for url
541        if ($url ne "") {
542            print $handle $url;
543        }
544
545    $first = 0;
546    $section = $doc_obj->get_next_section($section);
547    last if ($self->{'db_level'} eq "document"); # if no sections wanted, only add the docs
548    }
549
550    #GRB01062004: see code above moved from here
551}
552
553
554sub text {
555    my $self = shift (@_);
556    my ($doc_obj) = @_;
557   
558    my $handle = $self->{'outhandle'};
559    print $handle "basebuildproc::text function must be implemented in sub classes\n";
560    die "\n";
561}
562
563# should the document be indexed - according to the subcollection and language
564# specification.
565sub is_subcollection_doc {
566    my $self = shift (@_);
567    my ($doc_obj) = @_;
568   
569    my $indexed_doc = 1;
570    foreach my $indexexp (@{$self->{'indexexparr'}}) {
571    $indexed_doc = 0;
572    my ($field, $exp, $options) = split /\//, $indexexp;
573    if (defined ($field) && defined ($exp)) {
574        my ($bool) = $field =~ /^(.)/;
575        $field =~ s/^.// if $bool eq '!';
576        my @metadata_values;
577        if ($field =~ /^filename$/i) {
578        push(@metadata_values, $doc_obj->get_source_filename());
579        }
580        else {
581        @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
582        }
583        next unless @metadata_values;
584        foreach my $metadata_value (@metadata_values) {
585        if ($bool eq '!') {
586            if ($options =~ /^i$/i) {
587            if ($metadata_value !~ /$exp/i) {$indexed_doc = 1; last;}
588            } else {
589            if ($metadata_value !~ /$exp/) {$indexed_doc = 1; last;}
590            }
591        } else {
592            if ($options =~ /^i$/i) {
593            if ($metadata_value =~ /$exp/i) {$indexed_doc = 1; last;}
594            } else {
595            if ($metadata_value =~ /$exp/) {$indexed_doc = 1; last;}
596            }
597        }
598        }
599
600        last if ($indexed_doc == 1);
601    }
602    }
603   
604    # if this doc is so far in the sub collection, and we have lang info,
605    # now we check the languages to see if it matches
606    if($indexed_doc && defined $self->{'lang_meta'}) {
607    $indexed_doc = 0;
608    my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
609    if (defined $field) {
610        foreach my $lang (@{$self->{'langarr'}}) {
611        my ($bool) = $lang =~ /^(.)/;
612        if ($bool eq '!') {
613            $lang =~ s/^.//;
614            if ($field !~ /$lang/) {
615            $indexed_doc = 1; last;
616            }
617        } else {
618            if ($field =~ /$lang/) {
619            $indexed_doc = 1; last;
620            }
621        }
622        }
623    }
624    }
625    return $indexed_doc;
626   
627}
628
629# use 'Paged' if document has no more than 2 levels
630# and each section at second level has a number for
631# Title metadata
632# also use Paged if gsdlthistype metadata is set to Paged
633sub get_document_type {
634    my $self = shift (@_);
635    my ($doc_obj) = @_;
636
637    my $thistype = "VList";
638    my $childtype = "VList";
639    my $title;
640    my @tmp = ();
641   
642    my $section = $doc_obj->get_top_section ();
643   
644    my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
645    if (defined $gsdlthistype) {
646    if ($gsdlthistype eq "Paged") {
647        $childtype = "Paged";
648        if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
649        $thistype = "Paged";
650        } else {
651        $thistype = "Invisible";
652        }
653       
654        return ($thistype, $childtype);
655    } elsif ($gsdlthistype eq "Hierarchy") {
656        return ($thistype, $childtype); # use VList, VList
657    }
658    }
659    my $first = 1;
660    while (defined $section) {
661    @tmp = split /\./, $section;
662    if (scalar(@tmp) > 1) {
663        return ($thistype, $childtype);
664    }
665    if (!$first) {
666        $title = $doc_obj->get_metadata_element ($section, "Title");
667        if (!defined $title || $title !~ /^\d+$/) {
668        return ($thistype, $childtype);
669        }
670    }
671    $first = 0;
672    $section = $doc_obj->get_next_section($section);
673    }
674    if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
675    $thistype = "Paged";
676    } else {
677    $thistype = "Invisible";
678    }
679    $childtype = "Paged";
680    return ($thistype, $childtype);
681}
682
683sub assoc_files() {
684    my $self = shift (@_);
685    my ($doc_obj, $archivedir) = @_;
686    my ($afile);
687   
688    foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
689      #rint STDERR "Processing associated file - copy " . $assoc_file->[0] . " to " . $assoc_file->[1] . "\n";
690    # if assoc file starts with a slash, we put it relative to the assoc
691    # dir, otherwise it is relative to the HASH... directory
692    if ($assoc_file->[1] =~ m@^[/\\]@) {
693        $afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
694    } else {
695        $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
696    }
697    &util::hard_link ($assoc_file->[0], $afile);
698    }
699}
700
Note: See TracBrowser for help on using the browser.