Context Navigation

source: gsdl/trunk/perllib/mgbuildproc.pm@ 17110

Last change on this file since 17110 was 17110, checked in by kjdon, 16 years ago
changed way cjk separation is done. Not done in plugins any more, but is now an indexoption. cnseg called from filter_text method. generate_index_options sets up the field in buildproc
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 4.6 KB

Line
1	###########################################################################
2	#
3	# mgbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mg to process
28
29	package mgbuildproc;
30
31
32	use basebuildproc;
33	use strict;
34
35
# Declare basebuildproc as the parent class of mgbuildproc. Done inside
# BEGIN so that the inheritance chain is in place at compile time.
BEGIN {
    @mgbuildproc::ISA = qw(basebuildproc);
}
39
# Constructor.
#
# Builds the object with the basebuildproc constructor, forwarding every
# argument after the class name unchanged, then re-blesses the result
# into the requested class so that the overrides in this package apply.
#
# Returns the new object.
sub new {
    my $class = shift @_;

    # Explicit method-call syntax: the indirect form "new basebuildproc (...)"
    # is ambiguous to the parser inside a package that defines its own new().
    my $self = basebuildproc->new(@_);

    return bless $self, $class;
}
45
46
# Inserts a \cC control character directly in front of every "<p" tag
# opener (matched case-insensitively, and only when "p" ends the tag
# name, so "<pre" is left alone). \cC is presumably the paragraph
# marker mg expects -- confirm against the mg documentation.
#
# The text is the second argument and is modified in place through the
# @_ alias; nothing is copied. The value of the substitution (the number
# of markers inserted, or an empty string if none) is returned.
sub find_paragraphs {
    # zero-width match just before each "<p", so only the marker is added
    $_[1] =~ s/(?=<p\b)/\cC/gi;
}
50
# Writes the indexable text of one document to $self->{'output_handle'}.
#
# Argument: $doc_obj, the document object to process. Documents whose
# doc type is not "indexed_doc" are ignored completely.
#
# $self->{'index'} has the form "level:field,field,...". In the field
# list "all" expands to Title,Creator,text and "topall" to
# topTitle,topCreator,toptext. A field with a "top" prefix is only
# taken from the first section of the document.
#
# Output layout, as produced by the code below:
#   - the value of every processed field is followed by \cC
#   - multiple metadata values of one field are joined with \cC
#   - every section is followed by \cB; for the "document" level all the
#     \cB characters are collected and written after the whole text
#   - \cB and \cC occurring inside the content are removed first
#
# Updates the counters num_docs, num_sections, num_bytes and
# num_processed_bytes in $self.
sub text {
    my $self = shift (@_);
    my ($doc_obj) = @_;
    my $handle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # see if this document belongs to this subcollection
    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    my ($level, $fields) = split (/:/, $self->{'index'});
    # an index spec without a ':' leaves $fields undefined; treat that
    # as an empty field list
    $fields = "" unless defined $fields;
    $fields =~ s/\ball\b/Title,Creator,text/;
    $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;

    my $doc_section = 0; # just for this document
    my $text = "";
    my $text_extra = "";

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'} += 1;

        # sections without a gsdldoctype are treated as indexed sections
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc) && ($indexed_section eq "indexed_section" || $indexed_section eq "indexed_doc")) {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
            foreach my $field (split (/,/, $fields)) {
                # only deal with this field if it doesn't start with top or
                # this is the first section
                my $real_field = $field;
                if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
                    my $new_text = "";
                    if ($level eq "dummy") {
                        # a dummy index is a special case used when no
                        # indexes are specified (since there must always be
                        # at least one index or we can't retrieve the
                        # compressed text) - we add a small amount of text
                        # to these dummy indexes which will never be seen
                        # but will overcome mg's problems with building
                        # empty indexes
                        $new_text = "this is dummy text to stop mg barfing";
                        $self->{'num_processed_bytes'} += length ($new_text);

                    } elsif ($real_field eq "text") {
                        $new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
                        $self->{'num_processed_bytes'} += length ($new_text);
                        # drop separator characters from the content before
                        # the paragraph markers are inserted
                        $new_text =~ s/[\cB\cC]//g;
                        $self->find_paragraphs($new_text);

                    } else {
                        my $first = 1;
                        my @section_metadata = @{$doc_obj->get_metadata ($section, $real_field)};

                        # when building a section level index, sub sections
                        # may also take the metadata of the top section,
                        # depending on 'sections_index_document_metadata'
                        if ($level eq "section"
                            && $section ne $doc_obj->get_top_section()
                            && $self->{'indexing_text'}
                            && defined ($self->{'sections_index_document_metadata'})) {
                            if ($self->{'sections_index_document_metadata'} eq "always"
                                || (scalar(@section_metadata) == 0
                                    && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $real_field)});
                            }
                        }

                        foreach my $meta (@section_metadata) {
                            $meta =~ s/[\cB\cC]//g;
                            $self->{'num_processed_bytes'} += length ($meta);
                            $new_text .= "\cC" unless $first;
                            $new_text .= $meta if $self->{'store_text'};
                            $first = 0;
                        }
                    }

                    # filter the text
                    $new_text = $self->filter_text ($field, $new_text);

                    $text .= "$new_text\cC";
                }
            }
        }

        # a \cB is written for every section; for the document level
        # they are all held back until after the text
        if ($level eq "document") { $text_extra .= "\cB"; }
        else { $text .= "\cB"; }

        $section = $doc_obj->get_next_section($section);
    }

    print {$handle} "$text$text_extra";
}
140
141	1;
142

Note: See TracBrowser for help on using the repository browser.

Download in other formats: