source: trunk/gsdl/perllib/plugins/GBTPlug.pm@ 617

Last change on this file since 617 was 617, checked in by sjboddie, 25 years ago

a few fixes

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 3.4 KB
Line 
1###########################################################################
2#
3# GBTPlug.pm -- plugin for processing GB encoded text
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# creates a simple single-level document from a GB encoded .txt or
27# .text file, optionally gzipped (.gz); the match on filenames is
28# case-insensitive. Adds the filename as Title metadata.
29
30package GBTPlug;
31
32use BasPlug;
33use sorttools;
34use unicode;
35use cnseg;
36use gb;
37
38
# Set up inheritance at compile time: BasPlug is the only parent class.
BEGIN {
    @ISA = qw(BasPlug);
}
42
# Constructor.
#
#   $class - name of the class the new plugin object is blessed into
#
# Builds a BasPlug object and re-blesses it into $class.
# Returns the blessed object.
sub new {
    my ($class) = @_;

    # $self has to be a lexical: without "my" every call would write to
    # the package global $self, which would keep the most recently
    # created object alive and be visible to the rest of the package.
    my $self = BasPlug->new();

    return bless $self, $class;
}
49
# Tells the caller whether this plugin is recursive.
# Always returns 0: this is not a recursive plugin.
sub is_recursive {
    my ($self) = @_;

    my $recursive = 0;
    return $recursive;
}
55
56
# Reads one GB encoded text file, turns it into a single-section
# document and passes that document to $processor.
#
#   $pluginfo  - not used by this plugin
#   $base_dir  - directory that $file is relative to (might be "")
#   $file      - file to read (might include directories)
#   $metadata  - hash reference of metadata to attach to the document;
#                a value may be a single value or an array reference of
#                values, and is assumed to be utf8 already
#   $processor - object whose process() method is given the document;
#                its 'verbosity' entry switches on a progress message
#
# Returns the number of files processed (always 1 on success), or undef
# if this plugin can't process the file (the name does not end in .txt
# or .text, optionally followed by .gz, or the file does not exist).
# Dies if the file (or the zcat pipe for .gz files) can't be opened.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;

    my $filename = &util::filename_cat($base_dir, $file);

    return undef unless ($filename =~ /\.(te?xt(\.gz)?)$/i && (-e $filename));

    # the second capture group is only set when the name ends in .gz
    my $gz = (defined $2) ? 1 : 0;

    print STDERR "GBTPlug: processing $filename\n" if $processor->{'verbosity'};

    # create a new document
    # (arrow syntax: "new doc (...)" relies on a parser heuristic because
    # this package defines its own new())
    my $doc_obj = doc->new ($file, "indexed_doc");

    # Use a lexical filehandle with an explicit mode so that $filename is
    # never interpreted as a mode, and list-form pipe open so that it is
    # never seen by a shell.
    my $fh;
    if ($gz) {
        open ($fh, '-|', 'zcat', $filename) || die "GBTPlug::read - zcat can't open $filename\n";
    } else {
        open ($fh, '<', $filename) || die "GBTPlug::read - can't open $filename\n";
    }
    my $cursection = $doc_obj->get_top_section();

    # read the whole file; join gives "" for an empty file
    my $text = join ('', <$fh>);

    # Closing releases the handle (and reaps zcat for .gz files). A failed
    # close is only reported, so the text read so far is still processed.
    if (!close ($fh)) {
        print STDERR "GBTPlug::read - problem closing $filename\n";
    }

    # convert to unicode
    $text = &unicode::unicode2utf8(&gb::gb2unicode($text));

    # segment the Chinese words
    $text = &cnseg::segment($text);

    $doc_obj->add_utf8_text($cursection, "<pre>$text</pre>");

    # add Title metadata (filename with any leading slashes removed)
    my $title = $file;
    $title =~ s/^[\/\\]+//;
    $doc_obj->add_metadata($cursection, "Title", $title);

    # assume that any metadata passed to this plugin is already utf8
    foreach my $field (keys(%$metadata)) {
        # $metadata->{$field} may be an array reference
        if (ref ($metadata->{$field}) eq "ARRAY") {
            foreach my $value (@{$metadata->{$field}}) {
                $doc_obj->add_utf8_metadata ($cursection, $field, $value);
            }
        } else {
            $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
        }
    }

    # add OID
    $doc_obj->set_OID ();

    # process the document
    $processor->process($doc_obj);

    return 1; # processed the file
}
125
1261;
127
128
129
130
131
132
133
134
135
136
137
Note: See TracBrowser for help on using the repository browser.