Context Navigation

source: trunk/gsdl/src/phind/generate/phproc.pm@ 1604

Last change on this file since 1604 was 1604, checked in by paynter, 24 years ago
Numerous improvements for use with the new phindcgi script. The main ones are that three MGPP databases are now created (document data, phrase data, and word search) and that the data extracted from each document is set explicitly in the collection configuration file (usually it will be something like document:text or section:Title).
Property svn:keywords set to `Author Date Id Revision`
File size: 7.0 KB

Line
1	###########################################################################
2	#
3	# phproc.pm -- the Phind document processor
4	#
5	# Copyright (C) 2000 Gordon Paynter
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# This is the phind document processor object. It is used by the document
28	# reader plugins to extract the clauses from each document.
29
30	package phproc;
31
32	use docproc;
33	use util;
34
35	sub BEGIN {
36	@ISA = ('docproc');
37	}
38
39	sub new {
40	my ($class, $archive_dir, $phindex_dir, $phindcfg,
41	$language, $delimiter, $verbosity, $outhandle) = @_;
42
43	my $self = new docproc ();
44
45	$self->{'archive_dir'} = $archive_dir;
46	$self->{'phindex_dir'} = $phindex_dir;
47	$self->{'indexes'} = $phindcfg;
48
49	$language =~ s/,/\\|/g;
50	$self->{'language_exp'} = $language;
51	$self->{'delimiter'} = $delimiter;
52
53	$self->{'verbosity'} = $verbosity;
54	$self->{'outhandle'} = STDERR;
55	$self->{'outhandle'} = $outhandle if defined $outhandle;
56
57	&util::rm("$phindex_dir/clauses") if (-e "$phindex_dir/clauses");
58	open(TEXT, ">$phindex_dir/clauses")
59	\|\| die "Cannot open $phindex_dir/clauses: $!";
60	$self->{'txthandle'} = TEXT;
61
62	my $docfile = &util::filename_cat("$phindex_dir", "docs.txt");
63	&util::rm($docfile) if (-e $docfile);
64	open(DOCS, ">$docfile")
65	\|\| die "Cannot open $docfile: $!";
66	$self->{'dochandle'} = DOCS;
67
68	return bless $self, $class;
69
70	}
71
72	sub process {
73	my $self = shift (@_);
74	my ($doc_obj) = @_;
75
76	my $verbosity = $self->{'verbosity'};
77	my $top_section = $doc_obj->get_top_section();
78
79	my $title = $doc_obj->get_metadata_element ($top_section, "Title");
80	print "process: $title\n" if ($verbosity > 1);
81
82
83	# only consider english-language files
84	my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
85	my $phrlanguage = $self->{'language_exp'};
86	return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
87
88	# record this file
89	my $total++;
90	print "file $total: $file\n" if ($self->{'$verbosity'});
91
92	# store object ID & title in document index file
93	my $OID = $doc_obj->get_OID();
94	$OID = "NULL" unless defined $OID;
95
96	my $dochandle = $self->{'dochandle'};
97	# print "dochandle: =$dochandle=\n";
98	print $dochandle "<Document>\t$OID\t$title\n";
99
100	# XXX
101	# Store the text of this object
102	my $indexlist = $self->{'indexes'};
103	my @parts;
104	my ($index, $part, $level, $field, $section, $data, $text);
105
106	# Output the document delimiter
107	my $txthandle = $self->{'txthandle'};
108	print $txthandle $self->{'delimiter'}, "\n";
109
110	# Iterarate over all the indexes specified in collect.cfg and
111	# add their text to the clauses file.
112	foreach $index (@$indexlist) {
113	$text = "";
114
115	# Iterate over all the feilds in each index
116	@parts = split(/,/, $index);
117	foreach $part (@parts) {
118
119	# Each field has a level and a data element ((e.g. document:Title)
120	($level, $field) = split(/:/, $part);
121	die unless ($level && $field);
122
123	# Extract the text from every section
124	# (In phind, document:text and section:text are equivalent)
125	if ($field eq "text") {
126	$data = "";
127	$section = $doc_obj->get_top_section();
128	while (defined($section)) {
129	$data .= $doc_obj->get_text($section) . "\n";
130	$section = $doc_obj->get_next_section($section);
131	}
132	$text .= convert_gml_to_tokens($data) . "\n";
133	}
134
135	# Extract a metadata field from a document
136	elsif ($level eq "document") {
137	$data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
138	$text .= convert_gml_to_tokens($data) . "\n";
139	}
140
141	# Extract metadata from every section in a document
142	elsif ($level eq "section") {
143	$data = "";
144	$section = $doc_obj->get_top_section();
145	while (defined($section)) {
146	$data .= $doc_obj->get_metadata_element($section, $field) . "\n";
147	$section = $doc_obj->get_next_section($section);
148	}
149	$text .= convert_gml_to_tokens($data) . "\n";
150	}
151
152	# Some sort of specification which I don't understand
153	else {
154	die "Unknown level ($level) in phind key ($part) in phind index ($index)\n";
155	}
156
157	}
158
159	# print the text
160	print $txthandle "$text";
161
162	}
163	}
164
165
166
167	sub convert_gml_to_tokens {
168
169	$_ = shift @_;
170
171	# FIRST, remove GML tags
172
173	# Replace all whitespace with a simple space
174	s/\s+/ /gs;
175
176	# Remove everything that is in a tag
177	s/\s<p>\s/ PARAGRAPHBREAK /isg;
178	s/\s<br>\s/ LINEBREAK /isg;
179	s/<[^>]*>/ /sg;
180
181	# Now we have the text, but it may contain HTML
182	# elements coded as > etc. Remove these tags.
183	s/</</sg;
184	s/>/>/sg;
185
186	s/\s+/ /sg;
187	s/\s<p>\s/ PARAGRAPHBREAK /isg;
188	s/\s<br>\s/ LINEBREAK /isg;
189	s/<[^>]*>/ /sg;
190
191	# remove & and other miscellaneous markup tags
192	s/&/&/sg;
193	s/</</sg;
194	s/>/>/sg;
195	s/&/&/sg;
196
197	# replace<p> and <br> placeholders with carriage returns
198	s/PARAGRAPHBREAK/\n/sg;
199	s/LINEBREAK/\n/sg;
200
201
202	# Exceptional punctuation
203	#
204	# We make special cases of some punctuation
205
206	# remove any apostrophe that indicates omitted letters
207	s/(\w+)\'(\w*\s)/ $1$2 /g;
208
209	# remove period that appears in a person's initals
210	s/\s([A-Z])\./ $1 /g;
211
212	# replace hyphens in hypheanted words and names with a space
213	s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
214
215
216	# Convert the remaining text to "clause format",
217	# This means removing all excess punctuation and garbage text,
218	# normalising valid punctuation to fullstops and commas,
219	# then putting one cluse on each line.
220
221	# Insert newline when the end of a sentence is detected
222	# (delimter is: "[\.\?\!]\s")
223	s/\s*[\.\?\!]\s+/\n/g;
224
225	# split numbers after four digits
226	s/(\d\d\d\d)/$1 /g;
227
228	# split words after 32 characters
229
230	# squash repeated punctuation
231	tr/A-Za-z0-9 //cs;
232
233	# save email addresses
234	# s/\w+@\w+\.[\w\.]+/EMAIL/g;
235
236	# normalise clause breaks (mostly punctuation symbols) to commas
237	s/[^A-Za-z0-9 \n]+/ , /g;
238
239	# Remove repeated commas, and replace with newline
240	s/\s*,[, ]+/\n/g;
241
242	# remove extra whitespace
243	s/ +/ /sg;
244	s/^\s+//mg;
245	s/\s*$/\n/mg;
246
247	# remove lines that contain one word or less
248	s/^\w*$//mg;
249	s/^\s*$//mg;
250	tr/\n//s;
251
252	return $_;
253	}
254
255
256
257	1;
258

Note: See TracBrowser for help on using the repository browser.

Download in other formats: