Context Navigation

source: trunk/gsdl/src/phind/generate/phproc.pm@ 1631

Last change on this file since 1631 was 1631, checked in by paynter, 24 years ago
Changed copyrights to include NZDLP.
Property svn:keywords set to `Author Date Id Revision`
File size: 7.0 KB

Line
1	###########################################################################
2	#
3	# phproc.pm -- the Phind document processor
4	#
5	# Copyright (C) 2000 Gordon W. Paynter
6	# Copyright 2000 The New Zealand Digital Library Project
7	#
8	# A component of the Greenstone digital library software
9	# from the New Zealand Digital Library Project at the
10	# University of Waikato, New Zealand.
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28	# This is the phind document processor object. It is used by the document
29	# reader plugins to extract the clauses from each document.
30
31	package phproc;
32
33	use docproc;
34	use util;
35
36	sub BEGIN {
37	@ISA = ('docproc');
38	}
39
40	sub new {
41	my ($class, $archive_dir, $phindex_dir, $phindcfg,
42	$language, $delimiter, $verbosity, $outhandle) = @_;
43
44	my $self = new docproc ();
45
46	$self->{'archive_dir'} = $archive_dir;
47	$self->{'phindex_dir'} = $phindex_dir;
48	$self->{'indexes'} = $phindcfg;
49
50	$language =~ s/,/\\|/g;
51	$self->{'language_exp'} = $language;
52	$self->{'delimiter'} = $delimiter;
53
54	$self->{'verbosity'} = $verbosity;
55	$self->{'outhandle'} = STDERR;
56	$self->{'outhandle'} = $outhandle if defined $outhandle;
57
58	&util::rm("$phindex_dir/clauses") if (-e "$phindex_dir/clauses");
59	open(TEXT, ">$phindex_dir/clauses")
60	\|\| die "Cannot open $phindex_dir/clauses: $!";
61	$self->{'txthandle'} = TEXT;
62
63	my $docfile = &util::filename_cat("$phindex_dir", "docs.txt");
64	&util::rm($docfile) if (-e $docfile);
65	open(DOCS, ">$docfile")
66	\|\| die "Cannot open $docfile: $!";
67	$self->{'dochandle'} = DOCS;
68
69	return bless $self, $class;
70
71	}
72
73	sub process {
74	my $self = shift (@_);
75	my ($doc_obj) = @_;
76
77	my $verbosity = $self->{'verbosity'};
78	my $top_section = $doc_obj->get_top_section();
79
80	my $title = $doc_obj->get_metadata_element ($top_section, "Title");
81	print "process: $title\n" if ($verbosity > 2);
82
83
84	# only consider english-language files
85	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
86	my $phrlanguage = $self->{'language_exp'};
87	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
88
89	# record this file
90	my $total++;
91	print "file $total: $file\n" if ($self->{'$verbosity'});
92
93	# store object ID & title in document index file
94	my $OID = $doc_obj->get_OID();
95	$OID = "NULL" unless defined $OID;
96
97	my $dochandle = $self->{'dochandle'};
98	# print "dochandle: =$dochandle=\n";
99	print $dochandle "<Document>\t$OID\t$title\n";
100
101	# Store the text of this object
102	my $indexlist = $self->{'indexes'};
103	my @parts;
104	my ($index, $part, $level, $field, $section, $data, $text);
105
106	# Output the document delimiter
107	my $txthandle = $self->{'txthandle'};
108	print $txthandle $self->{'delimiter'}, "\n";
109
110	# Iterarate over all the indexes specified in collect.cfg and
111	# add their text to the clauses file.
112	foreach $index (@$indexlist) {
113	$text = "";
114
115	# Iterate over all the feilds in each index
116	@parts = split(/,/, $index);
117	foreach $part (@parts) {
118
119	# Each field has a level and a data element ((e.g. document:Title)
120	($level, $field) = split(/:/, $part);
121	die unless ($level && $field);
122
123	# Extract the text from every section
124	# (In phind, document:text and section:text are equivalent)
125	if ($field eq "text") {
126	$data = "";
127	$section = $doc_obj->get_top_section();
128	while (defined($section)) {
129	$data .= $doc_obj->get_text($section) . "\n";
130	$section = $doc_obj->get_next_section($section);
131	}
132	$text .= convert_gml_to_tokens($data) . "\n";
133	}
134
135	# Extract a metadata field from a document
136	elsif ($level eq "document") {
137	$data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
138	$text .= convert_gml_to_tokens($data) . "\n";
139	}
140
141	# Extract metadata from every section in a document
142	elsif ($level eq "section") {
143	$data = "";
144	$section = $doc_obj->get_top_section();
145	while (defined($section)) {
146	$data .= $doc_obj->get_metadata_element($section, $field) . "\n";
147	$section = $doc_obj->get_next_section($section);
148	}
149	$text .= convert_gml_to_tokens($data) . "\n";
150	}
151
152	# Some sort of specification which I don't understand
153	else {
154	die "Unknown level ($level) in phind key ($part) in phind index ($index)\n";
155	}
156
157	}
158
159	# print the text
160	print $txthandle "$text";
161
162	}
163	}
164
165
166
167	sub convert_gml_to_tokens {
168
169	$_ = shift @_;
170
171	# FIRST, remove GML tags
172
173	# Replace all whitespace with a simple space
174	s/\s+/ /gs;
175
176	# Remove everything that is in a tag
177	s/\s<p>\s/ PARAGRAPHBREAK /isg;
178	s/\s<br>\s/ LINEBREAK /isg;
179	s/<[^>]*>/ /sg;
180
181	# Now we have the text, but it may contain HTML
182	# elements coded as > etc. Remove these tags.
183	s/</</sg;
184	s/>/>/sg;
185
186	s/\s+/ /sg;
187	s/\s<p>\s/ PARAGRAPHBREAK /isg;
188	s/\s<br>\s/ LINEBREAK /isg;
189	s/<[^>]*>/ /sg;
190
191	# remove & and other miscellaneous markup tags
192	s/&/&/sg;
193	s/</</sg;
194	s/>/>/sg;
195	s/&/&/sg;
196
197	# replace<p> and <br> placeholders with carriage returns
198	s/PARAGRAPHBREAK/\n/sg;
199	s/LINEBREAK/\n/sg;
200
201
202	# Exceptional punctuation
203	#
204	# We make special cases of some punctuation
205
206	# remove any apostrophe that indicates omitted letters
207	s/(\w+)\'(\w*\s)/ $1$2 /g;
208
209	# remove period that appears in a person's initals
210	s/\s([A-Z])\./ $1 /g;
211
212	# replace hyphens in hypheanted words and names with a space
213	s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
214
215
216	# Convert the remaining text to "clause format",
217	# This means removing all excess punctuation and garbage text,
218	# normalising valid punctuation to fullstops and commas,
219	# then putting one cluse on each line.
220
221	# Insert newline when the end of a sentence is detected
222	# (delimter is: "[\.\?\!]\s")
223	s/\s*[\.\?\!]\s+/\n/g;
224
225	# split numbers after four digits
226	s/(\d\d\d\d)/$1 /g;
227
228	# split words after 32 characters
229
230	# squash repeated punctuation
231	tr/A-Za-z0-9 //cs;
232
233	# save email addresses
234	# s/\w+@\w+\.[\w\.]+/EMAIL/g;
235
236	# normalise clause breaks (mostly punctuation symbols) to commas
237	s/[^A-Za-z0-9 \n]+/ , /g;
238
239	# Remove repeated commas, and replace with newline
240	s/\s*,[, ]+/\n/g;
241
242	# remove extra whitespace
243	s/ +/ /sg;
244	s/^\s+//mg;
245	s/\s*$/\n/mg;
246
247	# remove lines that contain one word or less
248	s/^\w*$//mg;
249	s/^\s*$//mg;
250	tr/\n//s;
251
252	return $_;
253	}
254
255
256
257	1;
258

Note: See TracBrowser for help on using the repository browser.

Download in other formats: