source: gsdl/trunk/bin/script/convert_collection.pl@ 18470

Last change on this file since 18470 was 9834, checked in by kjdon, 19 years ago

changed the DTD ref from GreenstoneDirectoryMetadata to DirectoryMetadata

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 2.9 KB
Line 
1#! /usr/bin/perl -w
2
3# converts an old style humanity collection which uses an index.txt file to
4# use a single metadata.xml file instead
5
6
7BEGIN {
8 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
9 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
10}
11
12use util;
13use cfgread;
14
# Main conversion script.
#
# Usage: convert_collection.pl <collection-name>
#
# Reads import/index.txt of the named collection, copies every document
# directory listed there into a fresh "import.new" directory (converting
# its .htm file on the way, see new_document) and writes one metadata.xml
# describing all of the documents into import.new.

# Enforced from here to the end of the file; every variable below and in
# the subroutines is a declared lexical, so this only guards against typos.
use strict;

my $collection = $ARGV[0];
die "usage: $0 <collection-name>\n"
    unless defined $collection && length $collection;

my $collectdir = util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection);
my $importdir = util::filename_cat($collectdir, "import");

die "import directory $importdir does not exist\n" unless -d $importdir;

# new import structure will be created in "import.new" directory
my $importnewdir = $importdir . ".new";
# An already existing import.new is tolerated; new_document refuses to
# overwrite individual document directories inside it.
unless (-d $importnewdir) {
    mkdir($importnewdir)
        or die "cannot create directory $importnewdir: $!\n";
}


# read in index.txt file and generate metadata.xml, in the process
# converting the html files and copying them across to the import.new
# directory

my $metadata_xml = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n";
$metadata_xml .= "<!DOCTYPE DirectoryMetadata SYSTEM " .
    "\"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n";
$metadata_xml .= "<DirectoryMetadata>\n\n";

my $index_txt = util::filename_cat($importdir, "index.txt");

# The bareword handle is deliberate: cfgread::read_cfg_line is handed the
# handle by its package-qualified name below.
open(INDEXTXT, '<', $index_txt)
    or die "cannot open $index_txt for reading: $!\n";

my $line = [];
my @fields = ();
my $count = 0;
while (defined($line = cfgread::read_cfg_line("main::INDEXTXT"))) {

    # last if $count > 10;

    # Nothing to do for a line that yielded no fields.
    next unless @$line;

    if ($line->[0] eq "key:") {
        # A "key:" line names the metadata field for each column of the
        # document lines that follow it.
        shift @$line;
        @fields = @$line;
    }
    else {
        # Any other line describes one document: the first entry is its
        # job number, the remaining entries are metadata values.
        my $jobnumber = shift @$line;
        new_document($jobnumber);
        $count++;

        for my $i (0 .. $#$line) {
            my $entry = $line->[$i];
            if ($entry =~ /^<([^>]+)>(.*)$/) {
                # "<name>value" entries carry their own field name
                set_metadata($1, $2);
            }
            elsif (defined $fields[$i]) {
                # otherwise the field name comes from the "key:" line
                set_metadata($fields[$i], $entry);
            }
            else {
                # more values on this line than field names declared
                print STDERR "error 1\n";
            }
        }

        # close the elements opened by new_document
        $metadata_xml .= " </Description>\n";
        $metadata_xml .= " </FileSet>\n\n";
    }
}
close INDEXTXT;

$metadata_xml .= "</DirectoryMetadata>\n";

my $metafile = util::filename_cat($importnewdir, "metadata.xml");
open(my $meta_fh, '>', $metafile)
    or die "cannot open $metafile for writing: $!\n";
print {$meta_fh} $metadata_xml;
# buffered write errors only surface at close, so check it
close($meta_fh) or die "error while writing $metafile: $!\n";
80
81
# _shell_quote($text)
#
# Returns $text wrapped in single quotes so that a POSIX shell treats it
# as one literal word; embedded single quotes are closed, escaped and
# reopened.
sub _shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# new_document($jobnumber)
#
# Copies the document directory $importdir/$jobnumber into $importnewdir
# (dropping everything up to and including the first "/" of $jobnumber),
# runs convert_toc.pl over the copied "<name>.htm" file, and appends the
# opening <FileSet>, <FileName> and <Description> elements for the
# document to $metadata_xml.  The caller is expected to append the
# matching closing elements.
#
# Dies if the source directory is missing, the destination already
# exists, the .htm file is missing, or any of the copy/convert/rename
# steps fails.
sub new_document {
    my ($jobnumber) = @_;

    print STDERR "creating new document ($jobnumber)\n";

    my $docdir = util::filename_cat($importdir, $jobnumber);
    die "document directory $docdir does not exist\n" unless -d $docdir;

    # copy whole directory across to import.new
    $jobnumber =~ s/^.*?\///;
    my $newdocdir = util::filename_cat($importnewdir, $jobnumber);
    die "$newdocdir already exists, refusing to overwrite it\n"
        if -e $newdocdir;
    # list form: the paths are passed to cp as-is, no shell involved
    system("cp", "-r", $docdir, $newdocdir) == 0
        or die "copying $docdir to $newdocdir failed (status $?)\n";

    # convert the htm file to use the new syntax
    my $htmfile = util::filename_cat($newdocdir, "$jobnumber.htm");
    die "$htmfile does not exist\n" unless -e $htmfile;
    my $converted = "$htmfile.new";
    # The redirections need a shell, so both paths are quoted explicitly.
    my $command = "convert_toc.pl < " . _shell_quote($htmfile)
                . " > " . _shell_quote($converted);
    system($command) == 0
        or die "convert_toc.pl failed on $htmfile (status $?)\n";
    # Only replace the original once the conversion is known to have worked.
    rename($converted, $htmfile)
        or die "cannot rename $converted to $htmfile: $!\n";

    # update metadata.xml
    $metadata_xml .= " <FileSet>\n";
    $metadata_xml .= " <FileName>$jobnumber</FileName>\n";
    $metadata_xml .= " <Description>\n";
}
107
# set_metadata($key, $value)
#
# Appends one <Metadata> element named $key with content $value to
# $metadata_xml.  Both strings are XML-escaped first, so values taken
# from index.txt that contain "&", "<", ">" or a double quote cannot
# produce a malformed metadata.xml.
sub set_metadata {
    my ($key, $value) = @_;

    # $key and $value are private copies of the arguments, so escaping
    # them in place does not touch the caller's data.  "&" must be
    # replaced first or the entities added afterwards would be re-escaped.
    for my $text ($key, $value) {
        $text =~ s/&/&amp;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        $text =~ s/"/&quot;/g;
    }

    $metadata_xml .= " <Metadata name=\"$key\" mode=\"accumulate\">$value</Metadata>\n";
}
Note: See TracBrowser for help on using the repository browser.