root/gsdl/trunk/perllib/plugins/FOXPlugin.pm @ 18327

Revision 18327, 8.7 KB (checked in by ak19, 11 years ago)

Extra parameter to new doc(): the renaming method to be used on the file (base64 or URL encoding).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# FOXPlugin.pm
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin to process a Foxbase dbt file. This plugin provides the basic
27# functionality to read in the dbt and dbf files and process each record.
28# This general plugin should be overridden for a particular database to process
29# the appropriate fields in the file.
30
31package FOXPlugin;
32
33use BasePlugin;
34use util;
35use doc;
36use unicode;
37
38use strict;
39no strict 'refs'; # allow filehandles to be variables and viceversa
40
41
42sub BEGIN {
43    @FOXPlugin::ISA = ('BasePlugin');
44}
45
46my $arguments =
47    [ { 'name' => "process_exp",
48    'desc' => "{BasePlugin.process_exp}",
49    'type' => "regexp",
50    'reqd' => "no",
51    'deft' => &get_default_process_exp() },
52      { 'name' => "block_exp",
53    'desc' => "{BasePlugin.block_exp}",
54    'type' => "regexp",
55    'reqd' => "no",
56    'deft' => &get_default_block_exp() } ];
57
58my $options = { 'name'     => "FOXPlugin",
59        'desc'     => "{FOXPlugin.desc}",
60        'abstract' => "no",
61        'inherits' => "yes",
62        'args'     => $arguments };
63
64sub new {
65    my ($class) = shift (@_);
66    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
67    push(@$pluginlist, $class);
68
69    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
70    push(@{$hashArgOptLists->{"OptList"}},$options);
71
72    my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
73
74    return bless $self, $class;
75}
76
77sub get_default_process_exp {
78    my $self = shift (@_);
79
80    return q^(?i)\.dbf$^;
81}
82
83#dbt files are processed at the same time as dbf files
84sub get_default_block_exp {
85    my $self = shift (@_);
86
87    return q^(?i)\.dbt$^;
88}
89
90# return number of files processed, undef if can't process
91# Note that $base_dir might be "" and that $file might
92# include directories
93sub read {
94    my $self = shift (@_);
95    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
96 
97    # can we process this file??
98    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
99    return undef unless $self->can_process_this_file($filename_full_path);
100
101    print STDERR "<Processing n='$file' p='FOXPlugin'>\n" if ($gli);
102    print STDERR "FOXPlugin: processing $file\n" if $self->{'verbosity'} > 1;
103
104    my ($parent_dir) = $filename_full_path =~ /^(.*)\/[^\/]+\.dbf$/i;
105
106    # open the file
107    if (!open (FOXBASEIN, $filename_full_path)) {
108    if ($gli) {
109        print STDERR "<ProcessingError n='$file' r='Could not read $filename_full_path'>\n";
110    }
111    print STDERR "FOXPlugin::read - couldn't read $filename_full_path\n";
112    return -1; # error in processing
113    }
114
115    # read in the database header
116    my ($temp, %dbf);
117   
118    # read in information about dbt file
119    if (read (FOXBASEIN, $temp, 32) < 32) {
120    if ($gli) {
121        print STDERR "<ProcessingError n='$file' r='EOF while reading database header'>\n";
122    }
123    print STDERR "FOXPlugin::read - eof while reading database header\n";
124    close (FOXBASEIN);
125    return -1;
126    }
127   
128    # unpack the header
129    ($dbf{'hasdbt'},
130     $dbf{'modyear'}, $dbf{'modmonth'}, $dbf{'modday'},
131     $dbf{'numrecords'}, $dbf{'headerlen'},
132     $dbf{'recordlen'}) = unpack ("CCCCVvv", $temp);
133   
134    # process hasdbt
135    if ($dbf{'hasdbt'} == 131) {
136    $dbf{'hasdbt'} = 1;
137    } elsif ($dbf{'hasdbt'} == 3 || $dbf{'hasdbt'} == 48) {
138    $dbf{'hasdbt'} = 0;
139    } else {
140    if ($gli) {
141        print STDERR "<ProcessingError n='$file' r='Does not seem to be a Foxbase file'>\n";
142    }
143    print STDERR "FOXPlugin:read - $filename_full_path doesn't seem to be a Foxbase file\n";
144    return -1;
145    }
146
147    # read in the field description
148    $dbf{'numfields'} = 0;
149    $dbf{'fieldinfo'} = [];
150    while (read (FOXBASEIN, $temp, 1) > 0) {
151    last if ($temp eq "\x0d");
152    last if (read (FOXBASEIN, $temp, 31, 1) < 31);
153
154    my %field = ();
155    $field{'name'} = $self->extracttext($temp, 11);
156    ($field{'type'}, $field{'pos'}, $field{'len'}, $field{'dp'})
157        = unpack ("x11a1VCC", $temp);
158
159    push (@{$dbf{'fieldinfo'}}, \%field);
160
161    $dbf{'numfields'} ++;
162    }
163
164    # open the dbt file if we need to
165    my $dbtfullname = $filename_full_path;
166    if ($filename_full_path =~ /f$/) {
167    $dbtfullname =~ s/f$/t/;
168    } else {
169    $dbtfullname =~ s/F$/T/;
170    }
171    if ($dbf{'hasdbt'} && !open (DBTIN, $dbtfullname)) {
172    if ($gli) {
173        print STDERR "<ProcessingError n='$file' r='Could not read $dbtfullname'>\n";
174    }
175    print STDERR "FOXPlugin::read - couldn't read $dbtfullname\n";
176    close (FOXBASEIN);
177    return -1;
178    }
179
180    # read in and process each record in the database
181    my $numrecords = 0;
182    while (($numrecords < $dbf{'numrecords'}) &&
183       (read (FOXBASEIN, $temp, $dbf{'recordlen'}) == $dbf{'recordlen'})) {
184
185    # create a new record
186    my $record = [];
187   
188    foreach my $field (@{$dbf{'fieldinfo'}}) {
189        my $fieldvalue = "";
190       
191        if ($field->{'type'} eq "M" && $dbf{'hasdbt'}) {
192        # a memo field, look up this field in the dbt file
193        my $seekpos = substr ($temp, $field->{'pos'}, $field->{'len'});
194
195        $seekpos =~ s/^\s*//;
196        $seekpos = 0 unless $seekpos =~ /^\d+$/;
197       
198        $seekpos = $seekpos * 512;
199
200        if ($seekpos == 0) {
201            # there is no memo field
202
203        } elsif (seek (DBTIN, $seekpos, 0)) {
204            while (read (DBTIN, $fieldvalue, 512, length($fieldvalue)) > 0) {
205            last if ($fieldvalue =~ /\cZ/);
206            }
207
208            # remove everything after the control-Z
209            substr($fieldvalue, index($fieldvalue, "\cZ")) = "";
210
211        } else {
212            print STDERR "\nERROR - seek (to $seekpos) failed\n";
213        }
214
215        } else {
216        # a normal field
217        $fieldvalue = substr ($temp, $field->{'pos'}, $field->{'len'});
218        }
219
220        push (@$record, {%$field, 'value'=>$fieldvalue});
221    }
222
223    # process this record
224    $self->process_record ($pluginfo, $base_dir, $file, $metadata, $processor,
225                   $numrecords, $record);
226   
227    # finished another record...
228    $numrecords++;
229    }
230
231    # close the dbt file if we need to
232    if ($dbf{'hasdbt'}) {
233    close (DBTIN);
234    }
235
236    # close the dbf file
237    close (FOXBASEIN);
238
239    # finished processing
240    return 1;
241}
242
243
244# will extract a string from some larger string, making it
245# conform to a number of constraints
246sub extracttext {
247    my $self = shift (@_);
248    my ($text, $maxlen, $offset, $stopstr) = @_;
249    $offset = 0 unless defined $offset;
250    $stopstr = "\x00" unless defined $stopstr;
251   
252    # decide where the string finishes
253    my $end = index ($text, $stopstr, $offset);
254    $end = length ($text) if $end < 0;
255    $end = $offset+$maxlen if (defined $maxlen) && ($end-$offset > $maxlen);
256   
257    return "" if ($end <= $offset);
258    return substr ($text, $offset, $end-$offset);
259}
260
261
262# process_record should be overriden for a particular type
263# of database. This default version outputs an html document
264# containing all the fields in the record as a table.
265# It also assumes that the text is in utf-8.
266sub process_record {
267    my $self = shift (@_);
268    my ($pluginfo, $base_dir, $file, $metadata, $processor, $numrecords, $record) = @_;
269
270    # create a new document
271    my $doc_obj = new doc ($file, "indexed_doc", $self->{'file_rename_method'});
272
273    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
274    my $section = $doc_obj->get_top_section();
275
276    $doc_obj->add_metadata($section, "FileFormat", "FOX");
277    $doc_obj->add_metadata($section, "FileSize",   (-s $file));
278
279    # start of document
280    $doc_obj->add_utf8_text($section, "<table>\n");
281
282    # add each field
283    foreach my $field (@$record) {
284    if (defined ($field->{'name'}) && defined ($field->{'value'})) {
285        $doc_obj->add_utf8_text($section, "  <tr>\n");
286        $doc_obj->add_utf8_text($section, "    <td>$field->{'name'}</td>\n");
287        $doc_obj->add_utf8_text($section, "    <td>$field->{'value'}</td>\n");
288        $doc_obj->add_utf8_text($section, "  </tr>\n");
289    }
290    }
291
292    # end of document
293    $doc_obj->add_utf8_text($section, "</table>\n");
294
295    # add an object id
296    $self->add_OID($doc_obj);
297
298    # process the document
299    $processor->process($doc_obj);
300}
301
302
3031;
Note: See TracBrowser for help on using the browser.