source: trunk/gsdl/src/phind/generate/phproc.pm@ 1604

Last change on this file since 1604 was 1604, checked in by paynter, 24 years ago

Numerous improvements for use with the new phindcgi script. The main ones
are that three MGPP databases are now created (document data, phrase data,
and word search) and that the data extracted from each document is set
explicitly in the collection configuration file (usually it will be
something like document:text or section:Title).

  • Property svn:keywords set to Author Date Id Revision
File size: 7.0 KB
Line 
1###########################################################################
2#
3# phproc.pm -- the Phind document processor
4#
5# Copyright (C) 2000 Gordon Paynter
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# This is the phind document processor object. It is used by the document
28# reader plugins to extract the clauses from each document.
29
30package phproc;
31
32use docproc;
33use util;
34
35sub BEGIN {
36 @ISA = ('docproc');
37}
38
39sub new {
40 my ($class, $archive_dir, $phindex_dir, $phindcfg,
41 $language, $delimiter, $verbosity, $outhandle) = @_;
42
43 my $self = new docproc ();
44
45 $self->{'archive_dir'} = $archive_dir;
46 $self->{'phindex_dir'} = $phindex_dir;
47 $self->{'indexes'} = $phindcfg;
48
49 $language =~ s/,/\|/g;
50 $self->{'language_exp'} = $language;
51 $self->{'delimiter'} = $delimiter;
52
53 $self->{'verbosity'} = $verbosity;
54 $self->{'outhandle'} = STDERR;
55 $self->{'outhandle'} = $outhandle if defined $outhandle;
56
57 &util::rm("$phindex_dir/clauses") if (-e "$phindex_dir/clauses");
58 open(TEXT, ">$phindex_dir/clauses")
59 || die "Cannot open $phindex_dir/clauses: $!";
60 $self->{'txthandle'} = TEXT;
61
62 my $docfile = &util::filename_cat("$phindex_dir", "docs.txt");
63 &util::rm($docfile) if (-e $docfile);
64 open(DOCS, ">$docfile")
65 || die "Cannot open $docfile: $!";
66 $self->{'dochandle'} = DOCS;
67
68 return bless $self, $class;
69
70}
71
72sub process {
73 my $self = shift (@_);
74 my ($doc_obj) = @_;
75
76 my $verbosity = $self->{'verbosity'};
77 my $top_section = $doc_obj->get_top_section();
78
79 my $title = $doc_obj->get_metadata_element ($top_section, "Title");
80 print "process: $title\n" if ($verbosity > 1);
81
82
83 # only consider english-language files
84 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
85 my $phrlanguage = $self->{'language_exp'};
86 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
87
88 # record this file
89 my $total++;
90 print "file $total: $file\n" if ($self->{'$verbosity'});
91
92 # store object ID & title in document index file
93 my $OID = $doc_obj->get_OID();
94 $OID = "NULL" unless defined $OID;
95
96 my $dochandle = $self->{'dochandle'};
97 # print "dochandle: =$dochandle=\n";
98 print $dochandle "<Document>\t$OID\t$title\n";
99
100 # XXX
101 # Store the text of this object
102 my $indexlist = $self->{'indexes'};
103 my @parts;
104 my ($index, $part, $level, $field, $section, $data, $text);
105
106 # Output the document delimiter
107 my $txthandle = $self->{'txthandle'};
108 print $txthandle $self->{'delimiter'}, "\n";
109
110 # Iterarate over all the indexes specified in collect.cfg and
111 # add their text to the clauses file.
112 foreach $index (@$indexlist) {
113 $text = "";
114
115 # Iterate over all the feilds in each index
116 @parts = split(/,/, $index);
117 foreach $part (@parts) {
118
119 # Each field has a level and a data element ((e.g. document:Title)
120 ($level, $field) = split(/:/, $part);
121 die unless ($level && $field);
122
123 # Extract the text from every section
124 # (In phind, document:text and section:text are equivalent)
125 if ($field eq "text") {
126 $data = "";
127 $section = $doc_obj->get_top_section();
128 while (defined($section)) {
129 $data .= $doc_obj->get_text($section) . "\n";
130 $section = $doc_obj->get_next_section($section);
131 }
132 $text .= convert_gml_to_tokens($data) . "\n";
133 }
134
135 # Extract a metadata field from a document
136 elsif ($level eq "document") {
137 $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
138 $text .= convert_gml_to_tokens($data) . "\n";
139 }
140
141 # Extract metadata from every section in a document
142 elsif ($level eq "section") {
143 $data = "";
144 $section = $doc_obj->get_top_section();
145 while (defined($section)) {
146 $data .= $doc_obj->get_metadata_element($section, $field) . "\n";
147 $section = $doc_obj->get_next_section($section);
148 }
149 $text .= convert_gml_to_tokens($data) . "\n";
150 }
151
152 # Some sort of specification which I don't understand
153 else {
154 die "Unknown level ($level) in phind key ($part) in phind index ($index)\n";
155 }
156
157 }
158
159 # print the text
160 print $txthandle "$text";
161
162 }
163}
164
165
166
167sub convert_gml_to_tokens {
168
169 $_ = shift @_;
170
171 # FIRST, remove GML tags
172
173 # Replace all whitespace with a simple space
174 s/\s+/ /gs;
175
176 # Remove everything that is in a tag
177 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
178 s/\s*<br>\s*/ LINEBREAK /isg;
179 s/<[^>]*>/ /sg;
180
181 # Now we have the text, but it may contain HTML
182 # elements coded as &gt; etc. Remove these tags.
183 s/&lt;/</sg;
184 s/&gt;/>/sg;
185
186 s/\s+/ /sg;
187 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
188 s/\s*<br>\s*/ LINEBREAK /isg;
189 s/<[^>]*>/ /sg;
190
191 # remove &amp; and other miscellaneous markup tags
192 s/&amp;/&/sg;
193 s/&lt;/</sg;
194 s/&gt;/>/sg;
195 s/&amp;/&/sg;
196
197 # replace<p> and <br> placeholders with carriage returns
198 s/PARAGRAPHBREAK/\n/sg;
199 s/LINEBREAK/\n/sg;
200
201
202 # Exceptional punctuation
203 #
204 # We make special cases of some punctuation
205
206 # remove any apostrophe that indicates omitted letters
207 s/(\w+)\'(\w*\s)/ $1$2 /g;
208
209 # remove period that appears in a person's initals
210 s/\s([A-Z])\./ $1 /g;
211
212 # replace hyphens in hypheanted words and names with a space
213 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
214
215
216 # Convert the remaining text to "clause format",
217 # This means removing all excess punctuation and garbage text,
218 # normalising valid punctuation to fullstops and commas,
219 # then putting one cluse on each line.
220
221 # Insert newline when the end of a sentence is detected
222 # (delimter is: "[\.\?\!]\s")
223 s/\s*[\.\?\!]\s+/\n/g;
224
225 # split numbers after four digits
226 s/(\d\d\d\d)/$1 /g;
227
228 # split words after 32 characters
229
230 # squash repeated punctuation
231 tr/A-Za-z0-9 //cs;
232
233 # save email addresses
234 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
235
236 # normalise clause breaks (mostly punctuation symbols) to commas
237 s/[^A-Za-z0-9 \n]+/ , /g;
238
239 # Remove repeated commas, and replace with newline
240 s/\s*,[, ]+/\n/g;
241
242 # remove extra whitespace
243 s/ +/ /sg;
244 s/^\s+//mg;
245 s/\s*$/\n/mg;
246
247 # remove lines that contain one word or less
248 s/^\w*$//mg;
249 s/^\s*$//mg;
250 tr/\n//s;
251
252 return $_;
253}
254
255
256
2571;
258
Note: See TracBrowser for help on using the repository browser.