source: trunk/gsdl3/bin/script/convert_coll_from_gs2.pl@ 10360

Last change on this file since 10360 was 10360, checked in by kjdon, 19 years ago

added use strict. worked on the format statement conversion. have done Or, still to do If

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.4 KB
Line 
1#!/usr/bin/perl -w
2
BEGIN {
    # All four Greenstone environment variables must be present before
    # anything else is compiled; stop at the first one that is missing.
    foreach my $required (qw(GSDLHOME GSDL3HOME GSDL3SRCHOME GSDLOS)) {
        die "$required not set\n" unless defined $ENV{$required};
    }

    # Put the Greenstone 3 CPAN directory first on @INC, followed by the
    # Greenstone 2 perllib directory, ahead of the standard search path.
    unshift @INC,
        "$ENV{'GSDL3SRCHOME'}/lib/perl/cpan",
        "$ENV{'GSDLHOME'}/perllib";
}
11
12use colcfg;
13use util;
14use parsargv;
15use FileHandle;
16use XML::Writer;
17use GDBM_File;
18
19use strict;
20
# Run the conversion. The subs below are compiled before this statement
# executes, so calling main() ahead of its definition is fine.
main();
# Print the command-line synopsis and the list of options to STDOUT.
# Takes no arguments.
sub print_usage() {
    my @usage_lines = (
        "Usage: convert_coll_from_gs2.pl [options] coll-name\n",
        "options:\n",
        " -collectdir Directory where collection lives.\n",
        " -verbosity Controls the amount of output.\n",
        " -defaultlang The language that is considered the default (for display text etc). defaults to 'en'\n\n",
    );
    print STDOUT @usage_lines;
}
30
# Entry point: convert one Greenstone 2 collection's configuration
# (etc/collect.cfg and index/build.cfg) into the two Greenstone 3 files
# etc/collectionConfig.xml and index/buildConfig.xml.
#
# Options and the collection name are taken from @ARGV.  Dies when the
# arguments are invalid, when collect.cfg, build.cfg or the collection's
# gdbm database is missing, or when an output file cannot be opened.
sub main {

    my ($defaultlang, $verbosity, $collectdir);
    # note that no defaults are passed for most options as they're set
    # later (after we check the collect.cfg file)
    # ($verbosity is accepted on the command line but is not consulted
    # anywhere in this sub.)
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/', \$verbosity,
                         'collectdir/.*/', \$collectdir,
                         'defaultlang/.*/', \$defaultlang)) {
        print_usage();
        die "\n";
    }

    # get and check the collection name
    my ($collection) = @ARGV;
    if (!defined($collection) || $collection eq "") {
        die "No collection specified\n";
    }
    if ($collection eq "gs2model") {
        die "You cant convert the model collection\n";
    }

    if (!defined $collectdir || $collectdir eq "") {
        $collectdir = util::filename_cat($ENV{'GSDLHOME'}, "collect");
    }

    if (!defined $defaultlang || $defaultlang eq "") {
        $defaultlang = 'en';
    }
    # add on the coll name
    $collectdir = util::filename_cat($collectdir, $collection);

    # ---- read the two Greenstone 2 configuration files ----
    my $collconfigfilename = util::filename_cat($collectdir, "etc", "collect.cfg");
    print STDOUT "coll config=$collconfigfilename\n";
    my $collectcfg;
    if (-e $collconfigfilename) {
        $collectcfg = colcfg::read_collect_cfg($collconfigfilename);
    } else {
        print STDERR "collect.cfg not found!!";
        die "\n";
    }

    my $buildconfigfilename = util::filename_cat($collectdir, "index", "build.cfg");
    my $buildcfg;
    if (-e $buildconfigfilename) {
        $buildcfg = colcfg::read_build_cfg($buildconfigfilename);
    } else {
        print STDERR "build.cfg not found!!";
        die "\n";
    }

    # ---- work out where the Greenstone 3 files go ----
    my $colloutfile = util::filename_cat($collectdir, "etc", "collectionConfig.xml");
    if (-e $colloutfile) {
        print STDOUT "collectionConfig file already exists! overwriting it!\n";
    }

    my $buildoutfile = util::filename_cat($collectdir, "index", "buildConfig.xml");
    if (-e $buildoutfile) {
        print STDOUT "buildConfig file already exists! overwriting it!\n";
    }

    # the gdbm database is needed to find out which classifiers are horizontal
    my $db_file = util::filename_cat($collectdir, "index", "text", "$collection.ldb");
    my $database;
    if (-e $db_file) {
        $database = open_database($db_file);
    } else {
        print STDERR "gdbm database file $db_file not found!!";
        die "\n";
    }

    # anything that does not declare a build type is treated as mg
    my $buildtype = defined $buildcfg->{'buildtype'} ? $buildcfg->{'buildtype'} : 'mg';

    # Open both outputs up front and fail loudly: XML::Writer would
    # otherwise write to STDOUT when handed an undefined OUTPUT.
    my $buildoutput = _open_output_file($buildoutfile);
    my $colloutput  = _open_output_file($colloutfile);

    my $buildwriter = XML::Writer->new(OUTPUT => $buildoutput, NEWLINES => 1);
    $buildwriter->startTag('buildConfig', 'xmlns:gsf'=>"http://www.greenstone.org/greenstone3/schema/ConfigFormat");

    my $collwriter = XML::Writer->new(OUTPUT => $colloutput, NEWLINES => 1);
    $collwriter->startTag('collectionConfig', 'xmlns:gsf'=>"http://www.greenstone.org/greenstone3/schema/ConfigFormat", 'xmlns:xsl'=>'http://www.w3.org/1999/XSL/Transform');

    #output the collection metadata to the collectionConfig file
    $collwriter->startTag('metadataList');
    my $creator = $collectcfg->{'creator'};
    output_metadata($collwriter, 'creator', $creator);
    $collwriter->endTag('metadataList');

    #output the display collectionmeta to collectionConfig.xml
    my $collectionmeta = $collectcfg->{'collectionmeta'};
    if (defined $collectionmeta) {
        my %name_map = ('collectionname', 'name',
                        'collectionextra', 'description',
                        'iconcollection', 'icon',
                        'iconcollectionsmall', 'smallicon');

        $collwriter->startTag('displayItemList');
        # sorted so that repeated runs produce the same file
        foreach my $entry (sort keys %$collectionmeta) {
            next unless defined $name_map{$entry};
            my $name = $name_map{$entry};
            foreach my $lang (sort keys %{$collectionmeta->{$entry}}) {
                my $value = $collectionmeta->{$entry}->{$lang};
                if ($entry =~ /^icon/) {
                    $value = format_icon_value($value);
                }
                # text stored under 'default' belongs to the -defaultlang language
                my $lang_code = ($lang eq 'default') ? $defaultlang : $lang;
                output_display($collwriter, $name, $lang_code, $value);
            }
        }
        $collwriter->endTag('displayItemList');
    }

    # output building metadata to build config file
    my $numdocs = $buildcfg->{'numdocs'};
    $buildwriter->startTag('metadataList');
    output_metadata($buildwriter, 'numDocs', $numdocs);
    output_metadata($buildwriter, 'buildType', $buildtype);
    $buildwriter->endTag('metadataList');

    #indexes
    my $indexmap = {};
    my @indexorder = ();   # long index names, in build.cfg order
    my $firstindex = "";
    if (defined $buildcfg->{'indexmap'}) {
        my $order;
        ($indexmap, $order) = _parse_arrow_map($buildcfg->{'indexmap'}, 'indexmap');
        @indexorder = @$order;
        $firstindex = $indexmap->{$indexorder[0]} if scalar(@indexorder);
    } else {
        print STDERR "indexmap not defined\n";
    }

    my $defaultindex;
    if (defined $collectcfg->{'defaultindex'}) {
        $defaultindex = $indexmap->{$collectcfg->{'defaultindex'}};
    }
    # use the first one when collect.cfg names no default, or names an
    # index that is not in the build's index map
    $defaultindex = $firstindex unless defined $defaultindex;

    # format stuff
    my $format = $collectcfg->{'format'};
    $format = {} unless defined $format;

    #output the search stuff to coll cfg
    $collwriter->startTag('search');
    foreach my $i (@indexorder) {
        my $shortname = $indexmap->{$i};
        $collwriter->startTag('index', 'name'=>$shortname);
        #find the coll meta stuff
        my $indexdisplay = ".$i";
        my $display = defined $collectionmeta ? $collectionmeta->{$indexdisplay} : undef;
        if (defined $display) {
            foreach my $lang (sort keys %$display) {
                my $lang_code = ($lang eq 'default') ? $defaultlang : $lang;
                output_display($collwriter, 'name', $lang_code, $display->{$lang});
            }
        }
        $collwriter->endTag('index');
    }

    # add in the format stuff
    if (defined $format->{'SearchVList'}) {
        $collwriter->startTag('format');
        write_format($collwriter, $format->{'SearchVList'}, "document");
        $collwriter->endTag('format');
    }

    $collwriter->endTag('search');

    $buildwriter->startTag('serviceRackList');

    my @levels = ();
    my $defaultlevel;

    #do the retrieve service
    # assume mgpp or mg
    if ($buildtype eq 'mgpp') {
        #for each level
        if (defined $buildcfg->{'indexlevels'}) {
            push @levels, @{$buildcfg->{'indexlevels'}};

            if (defined $buildcfg->{'textlevel'}) {
                $defaultlevel = $buildcfg->{'textlevel'};
            } else {
                $defaultlevel = $levels[0];
            }
        } else { #use levels from collect.cfg - must be an old collection
            @levels = ('Document');
            $defaultlevel = 'Document';
            if (defined $collectcfg->{'levels'}) {
                foreach my $level (@{$collectcfg->{'levels'}}) {
                    if ($level eq "Section") {
                        $defaultlevel = 'Section';
                    }
                    # 'Document' is always present; don't list any level twice
                    push @levels, $level unless grep { $_ eq $level } @levels;
                }
            }
        }

        $buildwriter->startTag('serviceRack', 'name'=>'GS2MGPPRetrieve');
        $buildwriter->emptyTag('defaultLevel', 'name'=>$defaultlevel);
    } else {
        $buildwriter->startTag('serviceRack', 'name'=>'GS2MGRetrieve');
        $buildwriter->emptyTag('defaultIndex', 'name'=>$defaultindex);
    }
    # close off the Retrieve service
    $buildwriter->endTag('serviceRack');

    # add in the classifiers if needed
    my $count = 1;
    my $phind = 0;
    my $started_classifiers = 0;
    if (defined $collectcfg->{'classify'}) {
        $collwriter->startTag('browse');
        # add in default format if necessary
        if (defined $format->{"VList"} || defined $format->{"HList"}) {
            # global formats
            $collwriter->startTag('format');
            if (defined $format->{"VList"}) {
                # VLIst applies to both classifier and doc nodes
                write_format($collwriter, $format->{"VList"}, "document");
                write_format($collwriter, $format->{"VList"}, "classifier");
            }
            if (defined $format->{"HList"}) {
                # hlist is only for classifier nodes
                write_format($collwriter, $format->{"HList"}, "horizontal");
            }
            $collwriter->endTag('format');
        }

        my $classifiers = $collectcfg->{'classify'};
        foreach my $cl (@$classifiers) {
            # the number is consumed even for phind, so the CLn names keep
            # matching the classifier's position in collect.cfg
            my $name = "CL$count";
            $count++;
            my $classname = $cl->[0];
            if ($classname =~ /^phind$/i) {
                $phind = 1;
                #should add it into coll config classifiers
                next;
            }

            my $horizontalAtTop = isHorizontalClassifier($database, $name);
            if (not $started_classifiers) {
                $buildwriter->startTag('serviceRack', 'name'=>'GS2Browse');
                $buildwriter->startTag('classifierList');
                $started_classifiers = 1;
            }

            my $content = ''; #use buttonname first, then metadata
            if ($classname eq "DateList") {
                $content = "Date";
            } else {
                for (my $i = 0; $i < scalar(@$cl); $i++) {
                    my $arg = $cl->[$i];
                    my $argvalue = $cl->[$i + 1];
                    if ($arg eq "-buttonname") {
                        # a trailing option with no value leaves $content alone
                        $content = $argvalue if defined $argvalue;
                        last;
                    } elsif ($arg eq "-metadata") {
                        $content = $argvalue if defined $argvalue;
                    }
                }
            }
            if ($horizontalAtTop) {
                $buildwriter->emptyTag('classifier', 'name'=>$name, 'content'=>$content, 'horizontalAtTop'=>'true');
            } else {
                $buildwriter->emptyTag('classifier', 'name'=>$name, 'content'=>$content);
            }

            $collwriter->startTag('classifier', 'name'=>$name);
            my $vlist = $name."VList";
            my $hlist = $name."HList";
            my $dlist = ($classname eq "DateList") ? "DateList" : "";
            # need to work out how to split into classifier and document
            if (defined $format->{$vlist} || defined $format->{$hlist} || defined $format->{$dlist}) {
                $collwriter->startTag('format');
                if (defined $format->{$vlist}) {
                    write_format($collwriter, $format->{$vlist}, "document");
                    write_format($collwriter, $format->{$vlist}, "classifier");
                }
                if (defined $format->{$hlist}) {
                    write_format($collwriter, $format->{$hlist}, "horizontal");
                }
                if (defined $format->{$dlist}) {
                    write_format($collwriter, $format->{$dlist}, "document");
                }
                $collwriter->endTag('format');
            }
            $collwriter->endTag('classifier');
        } #foreach classifier

        if ($started_classifiers) {
            # end the classifiers
            $buildwriter->endTag('classifierList');
            # close off the Browse service
            $buildwriter->endTag('serviceRack');
        }

        $collwriter->endTag('browse');
    }

    # the phind classifier is a separate service
    if ($phind) {
        $buildwriter->emptyTag('serviceRack', 'name'=>'PhindPhraseBrowse');
    }

    # do the search service
    if ($buildtype eq 'mgpp') {

        $buildwriter->startTag('serviceRack', 'name'=>'GS2MGPPSearch');
        $buildwriter->emptyTag('defaultLevel', 'name'=>$defaultlevel);
        $buildwriter->startTag('levelList');
        foreach my $level (@levels) {
            $buildwriter->emptyTag('level', 'name'=>$level);
        }
        $buildwriter->endTag('levelList');

        #fieldlist
        if (defined $buildcfg->{'indexfields'}) {
            my @fieldlist = @{$buildcfg->{'indexfields'}};
            my $fieldmap = {};
            if (defined $buildcfg->{'indexfieldmap'}) {
                ($fieldmap) = _parse_arrow_map($buildcfg->{'indexfieldmap'}, 'indexfieldmap');
            }
            $buildwriter->startTag('fieldList');
            foreach my $f (@fieldlist) {
                my $field = $fieldmap->{$f};
                $buildwriter->emptyTag('field', 'shortname'=>$field, 'name'=>$f);
            }
            $buildwriter->endTag('fieldList');
        } else {
            print STDERR "indexfields not defined\n";
        }

        # do the search types if there
        if (defined $collectcfg->{'searchtype'}) {
            $buildwriter->startTag('searchTypeList');
            foreach my $st (@{$collectcfg->{'searchtype'}}) {
                $buildwriter->emptyTag('searchType', 'name'=>$st);
            }
            $buildwriter->endTag('searchTypeList');
        }
    } else {
        # Same fallback as the retrieve service above: everything that is
        # not mgpp is handled as mg, so the rack closed below always exists.
        $buildwriter->startTag('serviceRack', 'name'=>'GS2MGSearch');
    }

    $buildwriter->emptyTag('defaultIndex', 'name'=>$defaultindex);
    $buildwriter->startTag('indexList');
    #for each index
    foreach my $i (@indexorder) {
        $buildwriter->emptyTag('index', 'name'=>$indexmap->{$i});
    }
    $buildwriter->endTag('indexList');

    $buildwriter->endTag('serviceRack');

    $buildwriter->endTag('serviceRackList');
    $buildwriter->endTag('buildConfig');
    $collwriter->endTag('collectionConfig');
    $collwriter->end();
    $buildwriter->end();
    $buildoutput->close();
    $colloutput->close();
    close_database($database);
}

# Open $path for writing (truncating any existing file) and return the
# IO::File handle; dies with the system error if the open fails.
sub _open_output_file {
    my ($path) = @_;
    my $handle = IO::File->new($path, 'w');
    die "Couldn't open $path for writing: $!\n" unless defined $handle;
    return $handle;
}

# Parse a list of "longname->shortname" strings into a hash.
# Returns (\%map, \@longnames) where @longnames keeps the input order.
# Entries that do not contain "->" are reported on STDERR and skipped.
sub _parse_arrow_map {
    my ($entries, $label) = @_;
    my %map;
    my @order;
    foreach my $entry (@$entries) {
        my ($long, $short) = $entry =~ /^(.*)\-\>(.*)$/;
        if (!defined $long) {
            print STDERR "ignoring malformed $label entry '$entry'\n";
            next;
        }
        push @order, $long unless exists $map{$long};
        $map{$long} = $short;
    }
    return (\%map, \@order);
}
430
431
# Write one <metadata name="...">value</metadata> element.
#   $writer    - object providing startTag/characters/endTag (an XML::Writer)
#   $metaname  - value for the 'name' attribute
#   $metavalue - element text; an undefined value (e.g. a setting missing
#                from the config file) is written as an empty element
sub output_metadata {
    my ($writer, $metaname, $metavalue) = @_;
    # avoid passing undef to characters(), which warns under -w
    $metavalue = '' unless defined $metavalue;
    $writer->startTag('metadata', 'name'=>$metaname);
    $writer->characters($metavalue);
    $writer->endTag('metadata');
}
438
# Write one <displayItem name="..." lang="...">value</displayItem> element.
#   $writer - object providing startTag/characters/endTag (an XML::Writer)
#   $name   - value for the 'name' attribute
#   $lang   - a language code, the word 'default', or a bracketed key of
#             the form "[l=xx]"; 'default', an undefined value, or a
#             bracketed key that carries no language all become 'en'
#   $value  - element text (undef is written as an empty string)
sub output_display {
    my ($writer, $name, $lang, $value) = @_;
    $lang = 'en' if !defined $lang || $lang eq 'default';
    if ($lang =~ /^\[/) {
        my ($code) = $lang =~ /\[l=(.*)\]/;
        # an unparsable key must not end up as an undefined attribute
        $lang = (defined $code && $code ne '') ? $code : 'en';
    }
    $value = '' unless defined $value;
    $writer->startTag('displayItem', 'name'=>$name, 'lang'=>$lang);
    $writer->characters($value);
    $writer->endTag('displayItem');
}
# Reduce an icon value that starts with '_' (a macro-prefixed path) to the
# part after its last '/'.  Any other value, and a '_' value with nothing
# usable after the last slash, is returned unchanged.
sub format_icon_value {
    my ($value) = @_;

    return $value unless $value =~ /^_/;

    my ($basename) = $value =~ m{/([^/]*)$};
    return $basename ? $basename : $value;
}
459
# Translate a Greenstone 2 format string into gsf markup and write it out
# wrapped in a <gsf:template> element.
#   $writer     - object providing startTag/charactersXML/endTag
#   $old_format - the Greenstone 2 format statement
#   $node_type  - "document", "classifier" or "horizontal"; selects the
#                 template's match/mode attributes.  Any other value
#                 converts the string but writes nothing.
sub write_format {
    my ($writer, $old_format, $node_type) = @_;

    # replace \' with '
    $old_format =~ s{\\'}{'}g;

    # {If} and {Or} first (assume no nesting for now), while their
    # [metadata] arguments are still in square-bracket form
    $old_format =~ s/\{If\}\{([^\}]*)\}/format_if($1, $node_type)/eg;
    $old_format =~ s/\{Or\}\{([^\}]*)\}/format_or($1)/eg;

    # fixed [] items that map straight onto a gsf element, applied in order
    my @fixed_items = (
        [ qr/\[Text\]/,      "<gsf:text/>" ],
        [ qr/\[num\]/,       "<gsf:num/>" ],
        [ qr/\[link\]/,      "<gsf:link>" ],
        [ qr/\[\/link\]/,    "</gsf:link>" ],
        [ qr/\[srclink\]/,   "<gsf:link type='source'>" ],
        [ qr/\[\/srclink\]/, "</gsf:link>" ],
        [ qr/\[icon\]/,      "<gsf:icon/>" ],
        [ qr/\[srcicon\]/,   "<gsf:icon type='source'/>" ],
    );
    foreach my $item (@fixed_items) {
        my ($pattern, $replacement) = @$item;
        $old_format =~ s/$pattern/$replacement/g;
    }

    # what to do with hightlight?? - dropped for now
    $old_format =~ s/\[\/?highlight\]//g;

    # whatever [] items are left are assumed to be metadata
    $old_format =~ s/\[([^\]]*)\]/format_metadata($1)/eg;

    # some html tidy: <br> and <p> become self-closing
    $old_format =~ s{<br>}{<br />}g;
    $old_format =~ s{<p>}{<p />}g;

    #put quotes around any atts
    $old_format =~ s/=([a-z]+)([> ])/='$1'$2/g;

    # attributes of the enclosing template for each kind of node
    my %template_atts = (
        'document'   => [ 'match'=>'documentNode' ],
        'classifier' => [ 'match'=>'classifierNode' ],
        'horizontal' => [ 'match'=>'classifierNode', 'mode'=>'horizontal' ],
    );
    my $atts = $template_atts{$node_type};
    if (defined $atts) {
        $writer->startTag('gsf:template', @$atts);
        $writer->charactersXML($old_format);
        $writer->endTag('gsf:template');
    }
}
522
# Convert the inside of a Greenstone 2 [metadata] item into a
# <gsf:metadata .../> element (returned as a string).
#
# Recognised forms of $metadata_string (without the square brackets):
#   Name, ex.Name           -> name='Name'
#   cgisafe:Name            -> the cgisafe: modifier is dropped
#   parent:Name             -> select='parent'
#   parent(Top):Name        -> select='root'
#   parent(All):Name        -> select='ancestors'
#   parent(All'sep'):Name   -> select='ancestors' separator='sep'
#   sibling:Name            -> multiple='true'
#   sibling(All'sep'):Name  -> multiple='true' separator='sep'
sub format_metadata {
    my ($metadata_string) = @_;

    # what shall we do with cgisafe??  For now it is simply removed.
    $metadata_string =~ s/^cgisafe://;

    my ($select, $scope, $delim);
    if ($metadata_string =~ s/^(parent|sibling)//) {
        $select = $1;
        if ($metadata_string =~ s/^\((Top|All)\)?//) {
            $scope = $1;
            # The separator follows the scope directly: (All'sep').  A
            # leading ':' is still tolerated because that is the only
            # form this sub used to accept.
            if ($metadata_string =~ s/^:?\'([^\']*)\'\)//) {
                $delim = $1;
            }
        }
    }
    $metadata_string =~ s/^://;
    # remove ex.
    $metadata_string =~ s/^ex\.//;

    my $new_format = "<gsf:metadata name='$metadata_string' ";
    if (defined $select) {
        if ($select eq "sibling") {
            $new_format .= "multiple='true' ";
            if (defined $delim) {
                $new_format .= "separator='$delim' ";
            }
        } elsif ($select eq "parent") {
            if (defined $scope) {
                if ($scope eq "Top") {
                    $new_format .= "select='root' ";
                } elsif ($scope eq "All") {
                    $new_format .= "select='ancestors' ";
                    if (defined $delim) {
                        $new_format .= "separator='$delim' ";
                    }
                }
            } else {
                $new_format .= "select='parent' ";
            }
        }
    }
    $new_format .= "/>";
    return $new_format;
}
576
# Placeholder for the {If}{...} conversion, which is not implemented yet.
# Both arguments (the text between the braces and the node type) are
# accepted and ignored; a fixed marker string is returned instead.
sub format_if {
    my $if_string = shift;
    my $node_type = shift;

    my $placeholder = "if statement to go here";
    return $placeholder;
}
583
# Convert the argument list of a Greenstone 2 {Or}{a,b,...} statement into
# a <gsf:choose-metadata> element (returned as a string).
#
# $or_string is the comma-separated text between the second pair of
# braces.  Each [metadata] option becomes a <gsf:metadata/> child; the
# first option that is not a [metadata] item is taken as the literal
# default value and ends the list.  Returns "" for an empty list.
sub format_or {
    my ($or_string) = @_;
    my @meta_list = split (',', $or_string);
    return "" unless scalar (@meta_list);
    my $new_format = "<gsf:choose-metadata>";
    foreach my $m (@meta_list) {
        # allow blanks around an option ("[a], [b]") so that it is not
        # mistaken for a default value, which would end the list early
        if ($m =~ /^\s*\[(.*)\]\s*$/) {
            $new_format .= format_metadata($1);
        } else {
            # a default value
            $new_format .= "<gsf:default>$m</gsf:default>";
            last;
        }
    }
    $new_format .= "</gsf:choose-metadata>";
    return $new_format;
}
601
# Tie the gdbm file $db_file read-only and return a reference to the tied
# hash.  Dies if the tie fails.
sub open_database {
    my ($db_file) = @_;

    my %records;
    tie(%records, 'GDBM_File', $db_file, GDBM_READER(), 0400)
        or die "Couldn't open database $db_file\n";

    return \%records;
}
611
# Release the tie on the hash referenced by $database (as returned by
# open_database).
sub close_database {
    my $database = shift;
    untie %{$database};
}
# Report whether the classifier stored under key $name in the collection
# database has a <childtype> of HList.
#   $database - hash reference (the tied database) keyed by node name
#   $name     - classifier key, e.g. "CL1"
# Returns 1 for an HList classifier, 0 otherwise - including when the
# record is missing or carries no <childtype> field.
sub isHorizontalClassifier {
    my ($database, $name) = @_;

    my $record = $database->{$name};
    # nothing stored for this classifier: treat it as not horizontal
    return 0 unless defined $record;

    my ($childtype) = $record =~ /<childtype>(\w*)/;
    return (defined $childtype && $childtype eq "HList") ? 1 : 0;
}
626#$writer->startTag('');
627#$writer->endTag('');
628#$writer->characters();
629#$writer->emptyTag('');
630
6311;
Note: See TracBrowser for help on using the repository browser.