source: trunk/gsdl/src/phind/generate/phproc.pm@ 1562

Last change on this file since 1562 was 1562, checked in by paynter, 24 years ago

Phind phrase browser - code for generating the phrase indexes

  • Property svn:keywords set to Author Date Id Revision
File size: 5.3 KB
Line 
1###########################################################################
2#
3# phproc.pm -- the Phind document processor
4#
5# Copyright (C) 2000 Gordon Paynter
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# This is the phind document processor object. It is used by the document
28# reader plugins to extract the clauses from each document.
29
30package phproc;
31
32use docproc;
33use util;
34
# Establish the inheritance chain at compile time: phproc is a
# subclass of docproc.
BEGIN {
    @ISA = qw(docproc);
}
38
# Construct a phind document processor.
#
# Positional arguments:
#   $class        class name the new object is blessed into
#   $archive_dir  stored on the object as 'archive_dir'
#   $phindex_dir  directory in which the "clauses" and "mg-d.txt"
#                 output files are (re)created
#   $language     comma-separated list; each comma is turned into a
#                 regex alternation bar ("en,fr" becomes "en|fr") and
#                 the result is stored as 'language_exp'
#   $delimiter    stored as 'delimiter' (written before each document's
#                 clauses by process())
#   $verbosity    stored as 'verbosity'
#   $outhandle    optional handle for messages; defaults to STDERR
#
# Returns the blessed object.  Dies if either output file cannot be
# opened for writing.
sub new {
    my ($class, $archive_dir, $phindex_dir,
        $language, $delimiter, $verbosity, $outhandle) = @_;

    my $self = docproc->new();

    # $self->{'collection'} = $collection;
    $self->{'archive_dir'} = $archive_dir;
    $self->{'phindex_dir'} = $phindex_dir;

    $language =~ s/,/\|/g if defined $language;
    $self->{'language_exp'} = $language;
    $self->{'delimiter'}    = $delimiter;

    $self->{'verbosity'} = $verbosity;
    # The default is kept as the handle *name* so that the stored value
    # is the same as it has always been.
    $self->{'outhandle'} = defined $outhandle ? $outhandle : 'STDERR';

    # Each object owns its own lexical filehandles.  Package-global
    # bareword handles would be shared (and silently re-opened) by every
    # phproc object, and two-argument open would let characters in the
    # directory name be interpreted as an open mode.
    my $clauses_file = "$phindex_dir/clauses";
    util::rm($clauses_file) if (-e $clauses_file);
    open(my $txthandle, '>', $clauses_file)
        or die "Cannot open $clauses_file: $!";
    $self->{'txthandle'} = $txthandle;

    my $docs_file = "$phindex_dir/mg-d.txt";
    util::rm($docs_file) if (-e $docs_file);
    open(my $dochandle, '>', $docs_file)
        or die "Cannot open $docs_file: $!";
    $self->{'dochandle'} = $dochandle;

    return bless $self, $class;
}
70
# Process one document: record it in the document index and append its
# text, converted to clause format, to the clauses file.
#
# Arguments:
#   $self     the phproc object (uses 'verbosity', 'outhandle',
#             'language_exp', 'delimiter', 'dochandle' and 'txthandle')
#   $doc_obj  document object; must provide get_top_section(),
#             get_metadata_element(), get_OID() and get_text()
#
# Documents that carry a "Language" metadata value which does not match
# the configured language expression are skipped (nothing is written).
# Documents with no Language metadata are always processed.
sub process {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $verbosity = $self->{'verbosity'};
    my $outhandle = $self->{'outhandle'};
    $outhandle = 'STDERR' unless defined $outhandle;

    my $top_section = $doc_obj->get_top_section();

    my $title = $doc_obj->get_metadata_element ($top_section, "Title");
    $title = "" unless defined $title;
    print $outhandle "process: $title\n" if ($verbosity && $verbosity > 1);

    # only consider files in one of the configured languages
    my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
    my $phrlanguage = $self->{'language_exp'};
    $phrlanguage = "" unless defined $phrlanguage;
    # The (?:...) wrapper keeps the pattern from ever being empty: an
    # empty pattern would make Perl reuse the last successful regex.
    # With no language configured, every document is accepted.
    return if ($doclanguage && ($doclanguage !~ /(?:$phrlanguage)/i));

    # object ID, used both for the progress message and the index file
    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;

    # record this file; the running count is kept on the object so that
    # it survives between calls
    my $total = ++$self->{'phproc_num_docs'};
    print $outhandle "file $total: $OID\n" if ($verbosity);

    # store object ID & title in document index file
    my $dochandle = $self->{'dochandle'};
    print $dochandle "$OID\t$title\n";

    # convert the text to clauses (one per line)
    my $text = convert_gml_to_tokens($doc_obj->get_text());

    # store the text, preceded by the document delimiter
    my $txthandle = $self->{'txthandle'};
    print $txthandle $self->{'delimiter'}, "\n$text\n";
}
105
106
107
# Convert a string of GML/HTML-ish document text to "clause format":
# markup is stripped, punctuation is normalised, and the remaining text
# is returned with one clause per line.
#
# Argument: the text to convert (undef is treated as the empty string).
# Returns:  the converted text as a single string.
#
# The caller's $_ is left untouched: the substitutions operate on a
# private copy that is aliased to $_ only for the duration of the
# "for" block.
sub convert_gml_to_tokens {

    my ($text) = @_;
    $text = '' unless defined $text;

    for ($text) {

        # FIRST, remove GML tags

        # Replace all whitespace with a simple space
        s/\s+/ /gs;

        # Remove everything that is in a tag, remembering paragraph
        # and line breaks with placeholder words
        s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
        s/\s*<br>\s*/ LINEBREAK /isg;
        s/<[^>]*>/ /sg;

        # Now we have the text, but it may contain HTML
        # elements coded as &gt; etc.  Decode the brackets and
        # strip the tags this exposes.
        s/&lt;/</sg;
        s/&gt;/>/sg;

        s/\s+/ /sg;
        s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
        s/\s*<br>\s*/ LINEBREAK /isg;
        s/<[^>]*>/ /sg;

        # decode &amp; and any entities that were doubly encoded
        s/&amp;/&/sg;
        s/&lt;/</sg;
        s/&gt;/>/sg;
        s/&amp;/&/sg;

        # replace <p> and <br> placeholders with newlines
        s/PARAGRAPHBREAK/\n/sg;
        s/LINEBREAK/\n/sg;


        # Exceptional punctuation
        #
        # We make special cases of some punctuation

        # remove any apostrophe that indicates omitted letters
        s/(\w+)\'(\w*\s)/ $1$2 /g;

        # remove period that appears in a person's initials
        s/\s([A-Z])\./ $1 /g;

        # replace hyphens in hyphenated words and names with a space
        s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;


        # Convert the remaining text to "clause format".
        # This means removing all excess punctuation and garbage text,
        # normalising valid punctuation to fullstops and commas,
        # then putting one clause on each line.

        # Insert newline when the end of a sentence is detected
        # (delimiter is: "[\.\?\!]\s")
        s/\s*[\.\?\!]\s+/\n/g;

        # split numbers after four digits
        s/(\d\d\d\d)/$1 /g;

        # split words after 32 characters
        # (not implemented)

        # squash repeated punctuation: runs of the same character that
        # is neither alphanumeric nor a space collapse to one instance
        tr/A-Za-z0-9 //cs;

        # save email addresses
        # s/\w+@\w+\.[\w\.]+/EMAIL/g;

        # normalise clause breaks (mostly punctuation symbols) to commas
        s/[^A-Za-z0-9 \n]+/ , /g;

        # Remove repeated commas, and replace with newline
        s/\s*,[, ]+/\n/g;

        # remove extra whitespace
        s/ +/ /sg;
        s/^\s+//mg;
        s/\s*$/\n/mg;

        # remove lines that contain one word or less
        s/^\w*$//mg;
        s/^\s*$//mg;
        tr/\n//s;
    }

    return $text;
}
195
196
197
1981;
199
Note: See TracBrowser for help on using the repository browser.