source: trunk/gsdl3/bin/script/convert_coll_from_gs2.pl@ 10653

Last change on this file since 10653 was 10653, checked in by kjdon, 19 years ago

added indexStem element into service racks if needed

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.8 KB
Line 
1#!/usr/bin/perl -w
2
# Compile-time setup: refuse to run without the Greenstone environment
# variables, then extend @INC so the modules loaded by the "use" lines
# that follow can be found.
BEGIN {
    foreach my $required (qw(GSDLHOME GSDL3HOME GSDL3SRCHOME GSDLOS)) {
        die "$required not set\n" unless defined $ENV{$required};
    }
    # each unshift puts its directory at the front of @INC, so the
    # cpan directory ends up being searched before perllib
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
    unshift(@INC, "$ENV{'GSDL3SRCHOME'}/lib/perl/cpan");
}
11
12use colcfg;
13use util;
14use parsargv;
15use FileHandle;
16use XML::Writer;
17use GDBM_File;
18
19use strict;
20
21&main();
# Prints the command line synopsis and the list of options to STDOUT.
sub print_usage() {
    my @usage_lines = (
        "Usage: convert_coll_from_gs2.pl [options] coll-name\n",
        "options:\n",
        " -collectdir Directory where collection lives.\n",
        " -verbosity Controls the amount of output.\n",
        " -defaultlang The language that is considered the default (for display text etc). defaults to 'en'\n\n",
    );
    print STDOUT join('', @usage_lines);
}
30
# Entry point. Reads etc/collect.cfg, index/build.cfg and the collection's
# gdbm database (index/text/<coll>.ldb) of the collection named on the
# command line, and writes etc/collectionConfig.xml and
# index/buildConfig.xml next to them (existing files are overwritten).
#
# Command line: [-verbosity N] [-collectdir DIR] [-defaultlang LANG] coll-name
#
# Dies (before any output file is touched) when the collection name is
# missing, when one of the input files does not exist, or when the
# collection's buildtype is something other than 'mg' or 'mgpp' - the
# racks written below only cover those two.
sub main {

    my ($defaultlang, $verbosity, $collectdir);
    # note that no defaults are passed for most options as they're set
    # later (after we check the collect.cfg file)
    if (!&parsargv::parse(\@ARGV,
                          'verbosity/\d+/', \$verbosity,
                          'collectdir/.*/', \$collectdir,
                          'defaultlang/.*/', \$defaultlang)) {
        &print_usage();
        die "\n";
    }

    # get and check the collection name
    my ($collection) = @ARGV;
    if (!defined($collection) || $collection eq "") {
        die "No collection specified\n";
    }
    if ($collection eq "gs2model") {
        die "You cant convert the model collection\n";
    }

    if (!defined $collectdir || $collectdir eq "") {
        $collectdir = &util::filename_cat($ENV{'GSDLHOME'}, "collect");
    }
    if (!defined $defaultlang || $defaultlang eq "") {
        $defaultlang = 'en';
    }
    # add on the coll name
    $collectdir = &util::filename_cat($collectdir, $collection);

    # ---- read the Greenstone 2 configuration --------------------------

    my $collconfigfilename = &util::filename_cat($collectdir, "etc", "collect.cfg");
    print STDOUT "coll config=$collconfigfilename\n";
    die "collect.cfg not found!!\n" unless -e $collconfigfilename;
    my $collectcfg = &colcfg::read_collect_cfg($collconfigfilename);

    my $buildconfigfilename = &util::filename_cat($collectdir, "index", "build.cfg");
    die "build.cfg not found!!\n" unless -e $buildconfigfilename;
    my $buildcfg = &colcfg::read_build_cfg($buildconfigfilename);

    # Only mg and mgpp racks are generated below. Stop here for anything
    # else rather than failing half way through with truncated output.
    my $buildtype = defined $buildcfg->{'buildtype'} ? $buildcfg->{'buildtype'} : 'mg';
    if ($buildtype ne 'mg' && $buildtype ne 'mgpp') {
        die "Unsupported buildtype '$buildtype': only mg and mgpp collections can be converted\n";
    }

    # stays undef when build.cfg has no indexstem; no indexStem element
    # is written in that case
    my $indexstem = $buildcfg->{'indexstem'};

    my $colloutfile = &util::filename_cat($collectdir, "etc", "collectionConfig.xml");
    if (-e $colloutfile) {
        print STDOUT "collectionConfig file already exists! overwriting it!\n";
    }

    my $buildoutfile = &util::filename_cat($collectdir, "index", "buildConfig.xml");
    if (-e $buildoutfile) {
        print STDOUT "buildConfig file already exists! overwriting it!\n";
    }

    my $db_file = &util::filename_cat($collectdir, "index", "text", "$collection.ldb");
    die "gdbm database file $db_file not found!!\n" unless -e $db_file;
    my $database = &open_database($db_file);

    # ---- open both output documents -----------------------------------

    my $buildoutput = IO::File->new($buildoutfile, "w")
        or die "Couldn't open $buildoutfile for writing: $!\n";
    my $buildwriter = XML::Writer->new(OUTPUT => $buildoutput, NEWLINES => 1);

    $buildwriter->startTag('buildConfig', 'xmlns:gsf'=>"http://www.greenstone.org/greenstone3/schema/ConfigFormat");

    my $colloutput = IO::File->new($colloutfile, "w")
        or die "Couldn't open $colloutfile for writing: $!\n";
    my $collwriter = XML::Writer->new(OUTPUT => $colloutput, NEWLINES => 1);

    $collwriter->startTag('collectionConfig', 'xmlns:gsf'=>"http://www.greenstone.org/greenstone3/schema/ConfigFormat", 'xmlns:xsl'=>'http://www.w3.org/1999/XSL/Transform');

    # ---- collection metadata -> collectionConfig.xml -------------------

    $collwriter->startTag('metadataList');
    my $creator = $collectcfg->{'creator'};
    &output_metadata($collwriter, 'creator', defined $creator ? $creator : '');
    $collwriter->endTag('metadataList');

    # display collectionmeta (name, description, icons) per language
    my $collectionmeta = $collectcfg->{'collectionmeta'};
    if (defined $collectionmeta) {
        my %name_map = ('collectionname', 'name',
                        'collectionextra', 'description',
                        'iconcollection', 'icon',
                        'iconcollectionsmall', 'smallicon');

        $collwriter->startTag('displayItemList');
        # sorted so that repeated runs produce identical files
        foreach my $entry (sort keys %$collectionmeta) {
            next unless defined $name_map{$entry};
            my $name = $name_map{$entry};
            foreach my $lang (sort keys %{$collectionmeta->{$entry}}) {
                my $value = $collectionmeta->{$entry}->{$lang};
                if ($entry =~ /^icon/) {
                    $value = &format_icon_value($value);
                }
                &output_display($collwriter, $name, $lang, $value, $defaultlang);
            }
        }
        $collwriter->endTag('displayItemList');
    }

    # ---- build metadata -> buildConfig.xml -----------------------------

    my $numdocs = $buildcfg->{'numdocs'};
    $buildwriter->startTag('metadataList');
    &output_metadata($buildwriter, 'numDocs', defined $numdocs ? $numdocs : '');
    &output_metadata($buildwriter, 'buildType', $buildtype);
    $buildwriter->endTag('metadataList');

    # ---- indexes --------------------------------------------------------

    # $indexmap maps the long index name to its short name; @indexnames
    # keeps the long names in build.cfg order so that the output order
    # does not depend on hash ordering
    my $indexmap = {};
    my @indexnames = ();
    if (defined $buildcfg->{'indexmap'}) {
        my ($map, $order) = &_parse_arrow_map($buildcfg->{'indexmap'}, 'indexmap');
        $indexmap = $map;
        @indexnames = @$order;
    } else {
        print STDERR "indexmap not defined\n";
    }
    my $firstindex = @indexnames ? $indexmap->{$indexnames[0]} : "";

    # the default index named in collect.cfg is a long name; when it is
    # absent (or not in the map) use the first index instead
    my $defaultindex;
    if (defined $collectcfg->{'defaultindex'}) {
        $defaultindex = $indexmap->{$collectcfg->{'defaultindex'}};
    }
    $defaultindex = $firstindex unless defined $defaultindex;

    # format stuff
    my $format = $collectcfg->{'format'};
    $format = {} unless defined $format;

    # ---- search section of collectionConfig.xml -------------------------

    $collwriter->startTag('search');
    foreach my $longname (@indexnames) {
        my $shortname = $indexmap->{$longname};
        $collwriter->startTag('index', 'name'=>$shortname);
        # the display name of an index is stored as collectionmeta ".<index>"
        my $indexdisplay = ".$longname";
        if (defined $collectionmeta && defined $collectionmeta->{$indexdisplay}) {
            foreach my $lang (sort keys %{$collectionmeta->{$indexdisplay}}) {
                my $value = $collectionmeta->{$indexdisplay}->{$lang};
                &output_display($collwriter, 'name', $lang, $value, $defaultlang);
            }
        }
        $collwriter->endTag('index');
    }

    # add in the format stuff
    if (defined $format->{'SearchVList'}) {
        $collwriter->startTag('format');
        &write_format($collwriter, $format->{'SearchVList'}, "document");
        $collwriter->endTag('format');
    }
    $collwriter->endTag('search');

    # ---- service racks: retrieve ----------------------------------------

    $buildwriter->startTag('serviceRackList');

    my @levels = ();
    my $defaultlevel;

    if ($buildtype eq 'mgpp') {
        if (defined $buildcfg->{'indexlevels'}) {
            push @levels, @{$buildcfg->{'indexlevels'}};
            $defaultlevel = defined $buildcfg->{'textlevel'}
                ? $buildcfg->{'textlevel'}
                : $levels[0];
        } else {
            # use levels from collect.cfg - must be an old collection
            @levels = ('Document');
            $defaultlevel = 'Document';
            if (defined $collectcfg->{'levels'}) {
                foreach my $level (@{$collectcfg->{'levels'}}) {
                    $defaultlevel = 'Section' if $level eq "Section";
                    # 'Document' is already present; don't list a level twice
                    push @levels, $level unless grep { $_ eq $level } @levels;
                }
            }
        }

        $buildwriter->startTag('serviceRack', 'name'=>'GS2MGPPRetrieve');
        $buildwriter->emptyTag('defaultLevel', 'name'=>$defaultlevel);
    } else {
        $buildwriter->startTag('serviceRack', 'name'=>'GS2MGRetrieve');
        $buildwriter->emptyTag('defaultIndex', 'name'=>$defaultindex);
    }
    if (defined $indexstem) {
        $buildwriter->emptyTag('indexStem', 'name'=>$indexstem);
    }
    # close off the Retrieve service
    $buildwriter->endTag('serviceRack');

    # ---- classifiers: browse rack + browse section ------------------------

    my $count = 1;
    my $phind = 0;
    my $started_classifiers = 0;
    if (defined $collectcfg->{'classify'}) {
        $collwriter->startTag('browse');
        # add in default format if necessary
        if (defined $format->{"VList"} || defined $format->{"HList"}) {
            # global formats
            $collwriter->startTag('format');
            if (defined $format->{"VList"}) {
                # VList applies to both classifier and doc nodes
                &write_format($collwriter, $format->{"VList"}, "document");
                &write_format($collwriter, $format->{"VList"}, "classifier");
            }
            if (defined $format->{"HList"}) {
                # HList is only for classifier nodes
                &write_format($collwriter, $format->{"HList"}, "horizontal");
            }
            $collwriter->endTag('format');
        }

        foreach my $cl (@{$collectcfg->{'classify'}}) {
            # every classify line consumes a CLn number, phind included
            my $name = "CL$count";
            $count++;
            my $classname = $cl->[0];
            if ($classname =~ /^phind$/i) {
                $phind = 1;
                #should add it into coll config classifiers
                next;
            }

            my $horizontalAtTop = &isHorizontalClassifier($database, $name);
            if (not $started_classifiers) {
                # the browse rack is only written once a non-phind
                # classifier has been seen
                $buildwriter->startTag('serviceRack', 'name'=>'GS2Browse');
                if (defined $indexstem) {
                    $buildwriter->emptyTag('indexStem', 'name'=>$indexstem);
                }
                $buildwriter->startTag('classifierList');
                $started_classifiers = 1;
            }

            # content: -buttonname wins as soon as it is seen, otherwise
            # the value of the last -metadata argument before it
            my $content = '';
            if ($classname eq "DateList") {
                $content = "Date";
            } else {
                foreach my $i (0 .. $#$cl) {
                    my $arg = $cl->[$i];
                    next unless $arg eq "-buttonname" || $arg eq "-metadata";
                    my $argvalue = $cl->[$i+1];
                    $content = defined $argvalue ? $argvalue : '';
                    last if $arg eq "-buttonname";
                }
            }

            my @classifier_atts = ('name'=>$name, 'content'=>$content);
            push @classifier_atts, ('horizontalAtTop'=>'true') if $horizontalAtTop;
            $buildwriter->emptyTag('classifier', @classifier_atts);

            $collwriter->startTag('classifier', 'name'=>$name);
            my $vlist = $format->{$name."VList"};
            my $hlist = $format->{$name."HList"};
            my $dlist = ($classname eq "DateList") ? $format->{"DateList"} : undef;
            # need to work out how to split into classifier and document
            if (defined $vlist || defined $hlist || defined $dlist) {
                $collwriter->startTag('format');
                if (defined $vlist) {
                    &write_format($collwriter, $vlist, "document");
                    &write_format($collwriter, $vlist, "classifier");
                }
                if (defined $hlist) {
                    &write_format($collwriter, $hlist, "horizontal");
                }
                if (defined $dlist) {
                    &write_format($collwriter, $dlist, "document");
                }
                $collwriter->endTag('format');
            }
            $collwriter->endTag('classifier');
        } #foreach classifier

        if ($started_classifiers) {
            # end the classifiers
            $buildwriter->endTag('classifierList');
            # close off the Browse service
            $buildwriter->endTag('serviceRack');
        }

        $collwriter->endTag('browse');
    }

    # the phind classifier is a separate service
    if ($phind) {
        $buildwriter->emptyTag('serviceRack', 'name'=>'PhindPhraseBrowse');
    }

    # ---- service racks: search ----------------------------------------------

    if ($buildtype eq 'mgpp') {
        $buildwriter->startTag('serviceRack', 'name'=>'GS2MGPPSearch');
        $buildwriter->emptyTag('defaultLevel', 'name'=>$defaultlevel);
        $buildwriter->startTag('levelList');
        foreach my $level (@levels) {
            $buildwriter->emptyTag('level', 'name'=>$level);
        }
        $buildwriter->endTag('levelList');

        # fieldlist
        if (defined $buildcfg->{'indexfields'}) {
            my $fieldmap = {};
            if (defined $buildcfg->{'indexfieldmap'}) {
                ($fieldmap) = &_parse_arrow_map($buildcfg->{'indexfieldmap'}, 'indexfieldmap');
            }
            $buildwriter->startTag('fieldList');
            foreach my $f (@{$buildcfg->{'indexfields'}}) {
                my $shortname = $fieldmap->{$f};
                $shortname = '' unless defined $shortname;
                $buildwriter->emptyTag('field', 'shortname'=>$shortname, 'name'=>$f);
            }
            $buildwriter->endTag('fieldList');
        } else {
            print STDERR "indexfields not defined\n";
        }

        # do the search types if there
        if (defined $collectcfg->{'searchtype'}) {
            $buildwriter->startTag('searchTypeList');
            foreach my $st (@{$collectcfg->{'searchtype'}}) {
                $buildwriter->emptyTag('searchType', 'name'=>$st);
            }
            $buildwriter->endTag('searchTypeList');
        }
    } else {
        # buildtype was validated above, so this is 'mg'
        $buildwriter->startTag('serviceRack', 'name'=>'GS2MGSearch');
    }

    if (defined $indexstem) {
        $buildwriter->emptyTag('indexStem', 'name'=>$indexstem);
    }

    $buildwriter->emptyTag('defaultIndex', 'name'=>$defaultindex);
    $buildwriter->startTag('indexList');
    foreach my $longname (@indexnames) {
        $buildwriter->emptyTag('index', 'name'=>$indexmap->{$longname});
    }
    $buildwriter->endTag('indexList');

    $buildwriter->endTag('serviceRack');

    # ---- finish both documents ------------------------------------------------

    $buildwriter->endTag('serviceRackList');
    $buildwriter->endTag('buildConfig');
    $collwriter->endTag('collectionConfig');
    $collwriter->end();
    $buildwriter->end();
    $buildoutput->close();
    $colloutput->close();
    &close_database($database);
}

# Parses a list of "longname->shortname" strings (the form used by the
# indexmap and indexfieldmap entries of build.cfg).
#   $entries - array reference of the strings to parse
#   $what    - name of the setting, used only in the warning message
# Returns a two element list: a hash reference mapping each long name to
# its short name, and an array reference with the long names in the order
# they first appeared. Entries without "->" are reported on STDERR and
# skipped.
sub _parse_arrow_map {
    my ($entries, $what) = @_;

    my %short_for = ();
    my @order = ();
    foreach my $entry (@$entries) {
        my ($long, $short) = $entry =~ /^(.*)\-\>(.*)$/;
        if (!defined $long) {
            print STDERR "ignoring malformed $what entry '$entry'\n";
            next;
        }
        push @order, $long unless exists $short_for{$long};
        $short_for{$long} = $short;
    }
    return (\%short_for, \@order);
}
445
446
# Writes one <metadata name="..."> element with text content.
#   $writer    - object providing startTag/characters/endTag
#   $metaname  - value of the name attribute
#   $metavalue - text content; an undefined value (e.g. a setting that is
#                missing from the config file) is written as empty text
sub output_metadata {
    my ($writer, $metaname, $metavalue) = @_;
    $metavalue = '' unless defined $metavalue;
    $writer->startTag('metadata', 'name'=>$metaname);
    $writer->characters($metavalue);
    $writer->endTag('metadata');
}
453
# Writes one <displayItem name="..." lang="..."> element with text content.
#   $writer      - object providing startTag/characters/endTag
#   $name        - value of the name attribute
#   $lang        - language key as found in collect.cfg: 'default', a
#                  bracketed "[l=xx]" specifier, or a plain language code
#   $value       - text content (undefined is written as empty text)
#   $defaultlang - optional; language code used for 'default' and for
#                  bracketed keys that carry no l=xx part. Defaults to 'en'.
sub output_display {
    my ($writer, $name, $lang, $value, $defaultlang) = @_;

    $defaultlang = 'en' unless defined $defaultlang && $defaultlang ne '';
    $lang = 'default' unless defined $lang;
    $value = '' unless defined $value;

    if ($lang eq 'default') {
        $lang = $defaultlang;
    } elsif ($lang =~ /^\[/) {
        my ($code) = $lang =~ /\[l=(.*)\]/;
        # never emit an undefined lang attribute
        $lang = (defined $code && $code ne '') ? $code : $defaultlang;
    }

    $writer->startTag('displayItem', 'name'=>$name, 'lang'=>$lang);
    $writer->characters($value);
    $writer->endTag('displayItem');
}
# Reduces an icon value that starts with an underscore to the part after
# its last slash. Values that do not start with an underscore, or whose
# trailing part is empty (or otherwise false), are returned unchanged.
sub format_icon_value {
    my ($value) = @_;

    return $value unless $value =~ /^_/;

    my ($trailing_part) = $value =~ /\/([^\/]*)$/;
    return $trailing_part ? $trailing_part : $value;
}
474
# Converts a Greenstone 2 format string to gsf markup and writes it as a
# <gsf:template> element.
#   $writer     - object providing startTag/charactersXML/endTag
#   $old_format - the Greenstone 2 format string
#   $node_type  - "document", "classifier" or "horizontal"; selects the
#                 match/mode attributes of the template. Any other value
#                 writes nothing.
sub write_format {
    my ($writer, $old_format, $node_type) = @_;

    # attributes of the gsf:template element for each node type
    my %template_atts = (
        'document'   => ['match'=>'documentNode'],
        'classifier' => ['match'=>'classifierNode'],
        'horizontal' => ['match'=>'classifierNode', 'mode'=>'horizontal'],
    );
    return unless defined $node_type && exists $template_atts{$node_type};

    # replace \' with '
    $old_format =~ s/\\\'/\'/g;

    # convert {If} and {Or}; assume no nesting for now
    $old_format =~ s/\{If\}\{([^\}]*)\}/&format_if($1, $node_type)/eg;
    $old_format =~ s/\{Or\}\{([^\}]*)\}/&format_or($1)/eg;

    # convert the fixed [] items to <gsf:...>
    $old_format =~ s/\[Text\]/\<gsf:text\/\>/g;
    $old_format =~ s/\[num\]/\<gsf:num\/\>/g;
    $old_format =~ s/\[link\]/\<gsf:link\>/g;
    $old_format =~ s/\[\/link\]/\<\/gsf:link\>/g;
    $old_format =~ s/\[srclink\]/\<gsf:link type=\'source\'\>/g;
    $old_format =~ s/\[\/srclink\]/\<\/gsf:link\>/g;
    $old_format =~ s/\[icon\]/\<gsf:icon\/\>/g;
    $old_format =~ s/\[srcicon\]/\<gsf:icon type=\'source\'\/\>/g;

    # highlight has no equivalent yet; drop the markers
    $old_format =~ s/\[\/?highlight\]//g;

    # now do the rest of the [] which are assumed to be metadata
    $old_format =~ s/\[([^\]]*)\]/&format_metadata($1)/eg;

    # some html tidy
    # turn <br> into <br />
    $old_format =~ s/\<br\>/\<br \/\>/g;
    # turn <p> into <p />
    $old_format =~ s/\<p\>/\<p \/\>/g;

    # put quotes around bare attribute values. Digits, upper case and a
    # few punctuation characters are accepted as well as lower case
    # letters, so that width=100, align=CENTER or width=50% are quoted too.
    $old_format =~ s/=([A-Za-z0-9_%#.\-]+)([> ])/=\'$1\'$2/g;

    $writer->startTag('gsf:template', @{$template_atts{$node_type}});
    $writer->charactersXML($old_format);
    $writer->endTag('gsf:template');
}
537
# Converts the inside of a Greenstone 2 [metadata] format item into a
# <gsf:metadata .../> element (returned as a string).
#
# Handled forms of $metadata_string:
#   Title, ex.Title            plain name (a leading "ex." is removed)
#   cgisafe:Title              the cgisafe: prefix is dropped
#   parent:Title               select='parent'
#   parent(Top):Title          select='root'
#   parent(All):Title          select='ancestors'
#   parent(All'sep'):Title     select='ancestors' separator='sep'
#   sibling:Title              multiple='true'
#   sibling(All'sep'):Title    multiple='true' separator='sep'
# The separator may also be introduced by a colon, as in (All:'sep').
sub format_metadata {
    my ($metadata_string) = @_;

    # what shall we do with cgisafe?? for now just drop the prefix
    $metadata_string =~ s/^cgisafe://;

    my ($select, $scope, $delim);
    if ($metadata_string =~ s/^(parent|sibling)//) {
        $select = $1;
        if ($metadata_string =~ s/^\((Top|All)\)?//) {
            $scope = $1;
            # the separator follows the scope directly: (All'sep').
            # An optional colon before the quote is accepted too.
            if ($metadata_string =~ s/^:?\'([^\']*)\'\)//) {
                $delim = $1;
            }
        }
    }
    $metadata_string =~ s/^://;
    # remove ex.
    $metadata_string =~ s/^ex\.//;

    my $new_format = "<gsf:metadata name='$metadata_string' ";
    if (defined $select) {
        if ($select eq "sibling") {
            $new_format .= "multiple='true' ";
            if (defined $delim) {
                $new_format .= "separator='$delim' ";
            }
        } elsif (!defined $scope) {
            $new_format .= "select='parent' ";
        } elsif ($scope eq "Top") {
            $new_format .= "select='root' ";
        } else {
            # scope is "All"
            $new_format .= "select='ancestors' ";
            if (defined $delim) {
                $new_format .= "separator='$delim' ";
            }
        }
    }
    $new_format .= "/>";
    return $new_format;
}
591
# Placeholder for the conversion of {If}{...} format items: whatever the
# arguments are, a fixed marker string is returned.
#   $condition_text - the text found between the braces of {If}{...}
#   $template_kind  - node type the surrounding format is written for
sub format_if {
    my ($condition_text, $template_kind) = @_;

    my $placeholder = "if statement to go here";
    return $placeholder;
}
598
# Converts the inside of an {Or}{a,b,...} format item into a
# <gsf:choose-metadata> element (returned as a string). The argument is
# split on commas; each piece of the form [name] is converted with
# format_metadata, and the first piece that is not of that form becomes
# the <gsf:default> value and ends the list. An argument that yields no
# pieces gives the empty string.
sub format_or {
    my ($or_string) = @_;

    my @alternatives = split (',', $or_string);
    return "" if !scalar(@alternatives);

    my @pieces = ("<gsf:choose-metadata>");
    ALTERNATIVE:
    foreach my $alternative (@alternatives) {
        my ($meta_name) = $alternative =~ /^\[(.*)\]$/;
        if (!defined $meta_name) {
            # a default value
            push @pieces, "<gsf:default>$alternative</gsf:default>";
            last ALTERNATIVE;
        }
        push @pieces, &format_metadata($meta_name);
    }
    push @pieces, "</gsf:choose-metadata>";

    return join('', @pieces);
}
616
# Ties a hash to the gdbm file $db_file (reader mode) and returns a
# reference to that hash. Dies when the tie fails.
sub open_database {
    my ($db_file) = @_;

    my %records;
    my $tied = tie (%records, 'GDBM_File', $db_file, &GDBM_READER, 0400);
    die "Couldn't open database $db_file\n" unless $tied;

    return \%records;
}
626
# Unties the hash behind the reference returned by open_database.
sub close_database {
    my ($db_ref) = @_;
    untie %{$db_ref};
}
# Reports whether the database record stored under $name has the child
# type "HList".
#   $database - hash reference (as returned by open_database)
#   $name     - key of the record, e.g. "CL1"
# Returns 1 when the record contains "<childtype>HList", 0 otherwise -
# including when there is no such record or it has no <childtype> entry.
sub isHorizontalClassifier {
    my ($database, $name) = @_;

    my $record = $database->{$name};
    return 0 unless defined $record;

    my ($childtype) = $record =~ /<childtype>(\w*)/;
    return (defined $childtype && $childtype eq "HList") ? 1 : 0;
}
641#$writer->startTag('');
642#$writer->endTag('');
643#$writer->characters();
644#$writer->emptyTag('');
645
6461;
Note: See TracBrowser for help on using the repository browser.