1 | ###########################################################################
|
---|
2 | #
|
---|
3 | # SRCPlug.pm -- source code plugin
|
---|
4 | #
|
---|
5 | # A component of the Greenstone digital library software
|
---|
6 | # from the New Zealand Digital Library Project at the
|
---|
7 | # University of Waikato, New Zealand.
|
---|
8 | #
|
---|
9 | # Copyright (C) 1999 New Zealand Digital Library Project
|
---|
10 | #
|
---|
11 | # This program is free software; you can redistribute it and/or modify
|
---|
12 | # it under the terms of the GNU General Public License as published by
|
---|
13 | # the Free Software Foundation; either version 2 of the License, or
|
---|
14 | # (at your option) any later version.
|
---|
15 | #
|
---|
16 | # This program is distributed in the hope that it will be useful,
|
---|
17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
19 | # GNU General Public License for more details.
|
---|
20 | #
|
---|
21 | # You should have received a copy of the GNU General Public License
|
---|
22 | # along with this program; if not, write to the Free Software
|
---|
23 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
24 | #
|
---|
25 | ###########################################################################
|
---|
26 | # John McPherson Nov 2000
|
---|
27 | # originally based on TEXTPlug
|
---|
28 |
|
---|
29 | # filename is currently used for Title ( optionally minus some prefix )
|
---|
30 |
|
---|
31 | # Current languages:
|
---|
32 | # text: READMEs/Makefiles
|
---|
33 | # C/C++ (currently extracts #include statements and C++ class decls)
|
---|
34 | # Perl (currently only done as text)
|
---|
35 | # Shell (currently only done as text)
|
---|
36 |
|
---|
37 | # 12/05/02 Added usage datastructure - John Thompson
|
---|
38 |
|
---|
39 | package SRCPlug;
|
---|
40 |
|
---|
41 | use BasPlug;
|
---|
42 | use parsargv;
|
---|
43 |
|
---|
44 | sub BEGIN {
|
---|
45 | @ISA = ('BasPlug');
|
---|
46 | }
|
---|
47 |
|
---|
48 | my $arguments =
|
---|
49 | [ { 'name' => "process_exp",
|
---|
50 | 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
|
---|
51 | 'type' => "string",
|
---|
52 | 'deft' => &get_default_process_exp(),
|
---|
53 | 'reqd' => "no" } ,
|
---|
54 | { 'name' => "block_exp",
|
---|
55 | 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
|
---|
56 | 'type' => 'string',
|
---|
57 | 'deft' => &get_default_block_exp(),
|
---|
58 | 'reqd' => "no" } ,
|
---|
59 | { 'name' => "remove_prefix",
|
---|
60 | 'desc' => "Remove this leading pattern from the filename (eg -remove_prefix /tmp/XX/src/). The default is to remove the whole path from the filename.",
|
---|
61 | 'type' => 'string',
|
---|
62 | 'deft' => "",
|
---|
63 | 'reqd' => "no" } ];
|
---|
64 |
|
---|
65 | my $options = { 'name' => "SRCPlug",
|
---|
66 | 'desc' => "Filename is currently used for Title ( optionally minus some prefix ). Current languages:\ntext: READMEs/Makefiles\nC/C++ (currently extracts #include statements and C++ class decls)\nPerl (currently only done as text)\nShell (currently only done as text)",
|
---|
67 | 'inherits' => "yes",
|
---|
68 | 'args' => $arguments };
|
---|
69 |
|
---|
70 | # sub print_usage {
|
---|
71 | # print STDERR "\n usage: plugin SRCPlug [options]\n";
|
---|
72 | # print STDERR "Try to import C and C++ source code. Adds \"class\"";
|
---|
73 | # print STDERR " metadata.\n\n";
|
---|
74 | # print STDERR " options:\n";
|
---|
75 | # print STDERR " -remove_prefix <pattern> Remove this leading pattern from the filename\n";
|
---|
76 | # print STDERR " (eg -remove_prefix /tmp/XX/src/). The default is to\n";
|
---|
77 | # print STDERR " remove the whole path from the filename.\n";
|
---|
78 | # print STDERR "\n";
|
---|
79 | # }
|
---|
80 |
|
---|
81 | sub new {
|
---|
82 | my ($class) = @_;
|
---|
83 | my $self = new BasPlug ($class, @_);
|
---|
84 |
|
---|
85 | # 14-05-02 To allow for proper inheritance of arguments - John Thompson
|
---|
86 | my $option_list = $self->{'option_list'};
|
---|
87 | push( @{$option_list}, $options );
|
---|
88 |
|
---|
89 | if (!parsargv::parse(\@_,
|
---|
90 | q^remove_prefix/(\S+)/^, \$self->{'remove_prefix'},
|
---|
91 | "allow_extra_options"
|
---|
92 | )
|
---|
93 | ) {
|
---|
94 | print STDERR "\nIncorrect options passed to SRCPlug, ";
|
---|
95 | print STDERR "check your collect.cfg configuration file\n";
|
---|
96 | $self->print_txt_usage();
|
---|
97 | die "\n";
|
---|
98 | }
|
---|
99 | return bless $self, $class;
|
---|
100 | }
|
---|
101 |
|
---|
102 | sub get_default_block_exp {
|
---|
103 | my $self = shift (@_);
|
---|
104 |
|
---|
105 | return q^(?i)\.(o|obj|a|so|dll)$^;
|
---|
106 | }
|
---|
107 |
|
---|
108 | sub get_default_process_exp {
|
---|
109 | my $self = shift (@_);
|
---|
110 |
|
---|
111 | # return q^(?i)\.te?xt$^;
|
---|
112 | return q^(Makefile.*|README.*|(?i)\.(c|cc|cpp|C|h|hpp|pl|pm|sh))$^;
|
---|
113 | }
|
---|
114 |
|
---|
115 |
|
---|
116 |
|
---|
117 | # do plugin specific processing of doc_obj
|
---|
118 | sub process {
|
---|
119 | my $self = shift (@_);
|
---|
120 | my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
|
---|
121 | my $outhandle = $self->{'outhandle'};
|
---|
122 |
|
---|
123 | print $outhandle "SRCPlug: processing $file\n"
|
---|
124 | if $self->{'verbosity'} > 1;
|
---|
125 |
|
---|
126 | my $cursection = $doc_obj->get_top_section();
|
---|
127 |
|
---|
128 | my $filetype="text"; # Makefiles, READMEs, ...
|
---|
129 | if ($file =~ /\.(cc|h|cpp|C)$/) {$filetype="C++";} # assume all .h files...
|
---|
130 | elsif ($file =~ /\.c$/) {$filetype="C";}
|
---|
131 | elsif ($file =~ /\.p(l|m)$/) {$filetype="perl";}
|
---|
132 | elsif ($file =~ /\.sh$/) {$filetype="sh";}
|
---|
133 |
|
---|
134 | # modify '<' and '>' for GML... (even though inside <pre> tags!!)
|
---|
135 | $$textref =~ s/</</g;
|
---|
136 | $$textref =~ s/>/>/g;
|
---|
137 | $$textref =~ s/_/_/g;
|
---|
138 | # try _escape_text($text) from doc.pm....
|
---|
139 |
|
---|
140 | # don't want mg to turn escape chars into actual values
|
---|
141 | $$textref =~ s/\\/\\\\/g;
|
---|
142 |
|
---|
143 | # use filename (minus any prefix) as the title.
|
---|
144 | my $title;
|
---|
145 | if ($self->{'remove_prefix' ne ""}) {
|
---|
146 | ($title = $file) =~ s/^$self->{'remove_prefix'}//;
|
---|
147 | } else {
|
---|
148 | ($title = $file) =~ s@^.*[/\\]@@; # remove pathname by default
|
---|
149 | }
|
---|
150 | $doc_obj->add_utf8_metadata ($cursection, "Title", $title);
|
---|
151 | # remove the gsdl prefix from the filename
|
---|
152 | my $relative_filename=$file;
|
---|
153 | $relative_filename =~ s@^.*?gsdl[/\\]@@;
|
---|
154 | $doc_obj->add_utf8_metadata ($cursection, "filename", $relative_filename);
|
---|
155 |
|
---|
156 | # class information from .h and .cc and .C and .cpp files
|
---|
157 | if ($filetype eq "C++")
|
---|
158 | {
|
---|
159 | process_c_plus_plus($textref,$pluginfo, $base_dir,
|
---|
160 | $file, $metadata, $doc_obj);
|
---|
161 | } elsif ($filetype eq "C")
|
---|
162 | {
|
---|
163 | get_includes_metadata($textref, $doc_obj);
|
---|
164 | }
|
---|
165 |
|
---|
166 |
|
---|
167 | # default operation...
|
---|
168 | # insert preformat tags and add text to document object
|
---|
169 | $doc_obj->add_utf8_text($cursection, "<pre>\n$$textref\n</pre>");
|
---|
170 |
|
---|
171 | return 1;
|
---|
172 | }
|
---|
173 |
|
---|
174 |
|
---|
175 |
|
---|
176 |
|
---|
177 | sub get_includes_metadata {
|
---|
178 | my ($textref, $doc_obj) = @_;
|
---|
179 |
|
---|
180 | my $topsection = $doc_obj->get_top_section();
|
---|
181 |
|
---|
182 | # Get '#include' directives for metadata
|
---|
183 | if ($$textref !~ /\#\s*include\b/) {
|
---|
184 | return;
|
---|
185 | }
|
---|
186 |
|
---|
187 | my @includes =
|
---|
188 | ($$textref =~ m/^\s*\#\s*include\s*(?:\"|<)(.*?)(?:\"|>)/mg);
|
---|
189 |
|
---|
190 | my $incs_done_ref=$doc_obj->get_metadata($section, "includes");
|
---|
191 | my @incs_done;
|
---|
192 | if (defined($incs_done_ref)) {
|
---|
193 | @incs_done=@$incs_done_ref;
|
---|
194 | } else {
|
---|
195 | @incs_done=();
|
---|
196 | }
|
---|
197 |
|
---|
198 | foreach my $inc (@includes) {
|
---|
199 | # add entries, but only if they don't already exist
|
---|
200 | if (!join('', map {$_ eq "$inc"?1:""} @incs_done)) {
|
---|
201 | push @incs_done, $inc;
|
---|
202 | $doc_obj->add_utf8_metadata($topsection, "includes", $inc);
|
---|
203 | }
|
---|
204 | }
|
---|
205 | }
|
---|
206 |
|
---|
207 |
|
---|
208 |
|
---|
209 | sub process_c_plus_plus {
|
---|
210 | my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
|
---|
211 |
|
---|
212 | my $topsection = $doc_obj->get_top_section();
|
---|
213 |
|
---|
214 |
|
---|
215 | # Check for include metadata
|
---|
216 | get_includes_metadata($textref, $doc_obj);
|
---|
217 |
|
---|
218 |
|
---|
219 |
|
---|
220 | # Get class declarations (but not forward declarations...) as metadata
|
---|
221 | if ($$textref =~ /\bclass\b/ ) {
|
---|
222 | my $classnames=$$textref;
|
---|
223 |
|
---|
224 | # remove commented lines
|
---|
225 | $classnames =~ s@/\*.*?\*/@@sg;
|
---|
226 | $classnames =~ s@//.*$@@mg;
|
---|
227 | while ($classnames =~ /\bclass\b/) {
|
---|
228 |
|
---|
229 | # delete all lines up to the next "class"
|
---|
230 | while ($classnames !~ /^[^\n]*\bclass\b[^\n]*\n/)
|
---|
231 | {$classnames =~ s/.*\n//;}
|
---|
232 |
|
---|
233 | # $classnames =~ s/^([^c][^l])*(.)?$//mg; # delete unneccessary lines
|
---|
234 |
|
---|
235 | # get the line including the next "class" and remove it from
|
---|
236 | # our tmp text.
|
---|
237 | $classnames =~ s/^(.*\bclass\b.*)$//m;
|
---|
238 |
|
---|
239 | # don't index if merely a reference/fwd decl. of another class
|
---|
240 | if ($1 !~ /(friend\Wclass)|(class\W\w+\W?\;)|(\/\/.*class)/) {
|
---|
241 | # $1 is still the whole line - eg:
|
---|
242 | # "class StaffSystem: public BaseStaffSystem"
|
---|
243 | my $wholeline=$1;
|
---|
244 | my $classname=$1;
|
---|
245 | $classname =~ s/.*class\W(\w+).*/$1/;
|
---|
246 | my $classes=$doc_obj->get_metadata($section, "class");
|
---|
247 | foreach my $elem (@$classes) {
|
---|
248 | if ("$elem" eq "$classname") {goto class_done;}
|
---|
249 | }
|
---|
250 | $doc_obj->add_utf8_metadata($topsection, "class", $classname);
|
---|
251 | class_done:
|
---|
252 | $doc_obj->add_utf8_metadata($topsection, "classdecl", $wholeline);
|
---|
253 | }
|
---|
254 | }
|
---|
255 | } # end of "class"
|
---|
256 |
|
---|
257 | return 1;
|
---|
258 | }
|
---|
259 |
|
---|
260 | 1;
|
---|
261 |
|
---|