source: trunk/gsdl/perllib/plugins/FOXPlug.pm@ 10218

Last change on this file since 10218 was 10218, checked in by kjdon, 19 years ago

Jeffrey's new parsing modifications, committed approx 6 July, 15.16

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.3 KB
Line 
1###########################################################################
2#
3# FOXPlug.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin to process a Foxbase dbt file. This plugin provides the basic
27# functionality to read in the dbt and dbf files and process each record.
28# This general plugin should be overridden for a particular database to process
29# the appropriate fields in the file.
30
31# 12/05/02 Added usage datastructure - John Thompson
32
33package FOXPlug;
34
35use BasPlug;
36use util;
37use doc;
38use unicode;
39use cnseg;
40# use gb;
41
42
43sub BEGIN {
44 @ISA = ('BasPlug');
45}
46
47my $options = { 'name' => "FOXPlug",
48 'desc' => "{FOXPlug.desc}",
49 'abstract' => "no",
50 'inherits' => "yes" };
51
52sub new {
53 my ($class) = shift (@_);
54 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
55 push(@$pluginlist, $class);
56
57 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
58 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
59
60 my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
61
62 return bless $self, $class;
63}
64
65sub is_recursive {
66 my $self = shift (@_);
67
68 return 0; # this is not a recursive plugin
69}
70
71
72# return number of files processed, undef if can't process
73# Note that $base_dir might be "" and that $file might
74# include directories
75sub read {
76 my $self = shift (@_);
77 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
78 my $fullname = &util::filename_cat ($base_dir, $file);
79
80 # dbt files are processed at the same time as dbf files
81 return 0 if ($fullname =~ /\.dbt$/i);
82
83 # see if this is a foxbase database
84 return undef unless (-f $fullname && $fullname =~ /\.dbf$/i);
85
86 my ($parent_dir) = $fullname =~ /^(.*)\/[^\/]+\.dbf$/i;
87
88 # open the file
89 if (!open (FOXBASEIN, $fullname)) {
90 if ($gli) {
91 print STDERR "<ProcessingError n='$file' r='Could not read $fullname'>\n";
92 }
93 print STDERR "FOXPlug::read - couldn't read $fullname\n";
94 return -1; # error in processing
95 }
96
97 print STDERR "<Processing n='$file' p='FOXPlug'>\n" if ($gli);
98
99 print STDERR "FOXPlug: processing $file\n";
100
101 # read in the database header
102 my ($temp, %dbf);
103
104 # read in information about dbt file
105 if (read (FOXBASEIN, $temp, 32) < 32) {
106 if ($gli) {
107 print STDERR "<ProcessingError n='$file' r='EOF while reading database header'>\n";
108 }
109 print STDERR "FOXPlug::read - eof while reading database header\n";
110 close (FOXBASEIN);
111 return -1;
112 }
113
114 # unpack the header
115 ($dbf{'hasdbt'},
116 $dbf{'modyear'}, $dbf{'modmonth'}, $dbf{'modday'},
117 $dbf{'numrecords'}, $dbf{'headerlen'},
118 $dbf{'recordlen'}) = unpack ("CCCCVvv", $temp);
119
120 # process hasdbt
121 if ($dbf{'hasdbt'} == 131) {
122 $dbf{'hasdbt'} = 1;
123 } elsif ($dbf{'hasdbt'} == 3 || $dbf{'hasdbt'} == 48) {
124 $dbf{'hasdbt'} = 0;
125 } else {
126 if ($gli) {
127 print STDERR "<ProcessingError n='$file' r='Does not seem to be a Foxbase file'>\n";
128 }
129 print STDERR "FOXPlug:read - $fullname doesn't seem to be a Foxbase file\n";
130 return -1;
131 }
132
133 # read in the field description
134 $dbf{'numfields'} = 0;
135 $dbf{'fieldinfo'} = [];
136 while (read (FOXBASEIN, $temp, 1) > 0) {
137 last if ($temp eq "\x0d");
138 last if (read (FOXBASEIN, $temp, 31, 1) < 31);
139
140 my %field = ();
141 $field{'name'} = $self->extracttext($temp, 11);
142 ($field{'type'}, $field{'pos'}, $field{'len'}, $field{'dp'})
143 = unpack ("x11a1VCC", $temp);
144
145 push (@{$dbf{'fieldinfo'}}, \%field);
146
147 $dbf{'numfields'} ++;
148 }
149
150 # open the dbt file if we need to
151 $dbtfullname = $fullname;
152 if ($fullname =~ /f$/) {
153 $dbtfullname =~ s/f$/t/;
154 } else {
155 $dbtfullname =~ s/F$/T/;
156 }
157 if ($dbf{'hasdbt'} && !open (DBTIN, $dbtfullname)) {
158 if ($gli) {
159 print STDERR "<ProcessingError n='$file' r='Could not read $dbtfullname'>\n";
160 }
161 print STDERR "FOXPlug::read - couldn't read $dbtfullname\n";
162 close (FOXBASEIN);
163 return -1;
164 }
165
166 # read in and process each record in the database
167 my $numrecords = 0;
168 while (($numrecords < $dbf{'numrecords'}) &&
169 (read (FOXBASEIN, $temp, $dbf{'recordlen'}) == $dbf{'recordlen'})) {
170
171 # create a new record
172 my $record = [];
173
174 foreach $field (@{$dbf{'fieldinfo'}}) {
175 my $fieldvalue = "";
176
177 if ($field->{'type'} eq "M" && $dbf{'hasdbt'}) {
178 # a memo field, look up this field in the dbt file
179 my $seekpos = substr ($temp, $field->{'pos'}, $field->{'len'});
180
181 $seekpos =~ s/^\s*//;
182 $seekpos = 0 unless $seekpos =~ /^\d+$/;
183
184 $seekpos = $seekpos * 512;
185
186 if ($seekpos == 0) {
187 # there is no memo field
188
189 } elsif (seek (DBTIN, $seekpos, 0)) {
190 while (read (DBTIN, $fieldvalue, 512, length($fieldvalue)) > 0) {
191 last if ($fieldvalue =~ /\cZ/);
192 }
193
194 # remove everything after the control-Z
195 substr($fieldvalue, index($fieldvalue, "\cZ")) = "";
196
197 } else {
198 print STDERR "\nERROR - seek (to $seekpos) failed\n";
199 }
200
201 } else {
202 # a normal field
203 $fieldvalue = substr ($temp, $field->{'pos'}, $field->{'len'});
204 }
205
206 push (@$record, {%$field, 'value'=>$fieldvalue});
207 }
208
209 # process this record
210 $self->process_record ($pluginfo, $base_dir, $file, $metadata, $processor,
211 $numrecords, $record);
212
213 # finished another record...
214 $numrecords++;
215 }
216
217 # close the dbt file if we need to
218 if ($dbf{'hasdbt'}) {
219 close (DBTIN);
220 }
221
222 # close the dbf file
223 close (FOXBASEIN);
224
225 # finished processing
226 return 1;
227}
228
229
230# will extract a string from some larger string, making it
231# conform to a number of constraints
232sub extracttext {
233 my $self = shift (@_);
234 my ($text, $maxlen, $offset, $stopstr) = @_;
235 $offset = 0 unless defined $offset;
236 $stopstr = "\x00" unless defined $stopstr;
237
238 # decide where the string finishes
239 my $end = index ($text, $stopstr, $offset);
240 $end = length ($text) if $end < 0;
241 $end = $offset+$maxlen if (defined $maxlen) && ($end-$offset > $maxlen);
242
243 return "" if ($end <= $offset);
244 return substr ($text, $offset, $end-$offset);
245}
246
247
248# process_record should be overriden for a particular type
249# of database. This default version outputs an html document
250# containing all the fields in the record as a table.
251# It also assumes that the text is in utf-8.
252sub process_record {
253 my $self = shift (@_);
254 my ($pluginfo, $base_dir, $file, $metadata, $processor, $numrecords, $record) = @_;
255
256 # create a new document
257 my $doc_obj = new doc ($file, "indexed_doc");
258 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
259 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
260 my $section = $doc_obj->get_top_section();
261
262 $doc_obj->add_metadata($section, "FileFormat", "FOX");
263 $doc_obj->add_metadata($section, "FileSize", (-s $file));
264
265 # start of document
266 $doc_obj->add_utf8_text($section, "<table>\n");
267
268 # add each field
269 foreach $field (@$record) {
270 if (defined ($field->{'name'}) && defined ($field->{'value'})) {
271 $doc_obj->add_utf8_text($section, " <tr>\n");
272 $doc_obj->add_utf8_text($section, " <td>$field->{'name'}</td>\n");
273 $doc_obj->add_utf8_text($section, " <td>$field->{'value'}</td>\n");
274 $doc_obj->add_utf8_text($section, " </tr>\n");
275 }
276 }
277
278 # end of document
279 $doc_obj->add_utf8_text($section, "</table>\n");
280
281 # add an object id
282 $doc_obj->set_OID();
283
284 # process the document
285 $processor->process($doc_obj);
286}
287
288
2891;
Note: See TracBrowser for help on using the repository browser.