source: gs2-extensions/parallel-building/trunk/src/src/txt2dbl-src/txt2dbl.cpp@ 24618

Last change on this file since 24618 was 24618, checked in by jmt12, 13 years ago

Adding the version of GDBM txt2db with simple file locking so as to support parallel building (albeit slowly)

File size: 7.4 KB
Line 
1/**********************************************************************
2 *
3 * txt2db.cpp --
4 * A component of the Greenstone digital library software
5 * from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * Copyright (C) 1999 The New Zealand Digital Library Project
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 **********************************************************************/
25
26
27#ifdef __WIN32__
28#include "autoconf.h"
29#include "systems.h"
30#include "gdbmconst.h"
31#include "gdbm.h"
32
33#else
34#include <gdbm.h>
35#endif
36
37//#include "gsdlconf.h"
38#include "text_t.h"
#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
#include <cstring>
42
43#if defined(GSDL_USE_OBJECTSPACE)
44# include <ospace\std\iostream>
45#elif defined(GSDL_USE_IOS_H)
46# include <iostream.h>
47#else
48# include <iostream>
49#endif
50
51using namespace std;
52
// Write the command-line synopsis followed by the option summary to the
// standard error stream.
// program_name: text shown as the command name in every usage line.
void print_usage (char *program_name) {
  // argument forms accepted by the program, one usage line each
  static const char *const forms[] = {
    " database-name",
    " -update database-name",
    " -append database-name"
  };
  const unsigned int num_forms = sizeof (forms) / sizeof (forms[0]);

  for (unsigned int i = 0; i < num_forms; ++i) {
    cerr << "usage: " << program_name << forms[i] << endl;
  }
  // blank line between the synopsis and the option list
  cerr << endl;

  cerr << "options:" << endl;
  cerr << " -update update existing database" << endl;
  cerr << " -append legacy alias for -update" << endl;
  cerr << endl;
}
61
// Acquire an exclusive, blocking advisory lock on a lock file (POSIX only).
// [hs, 2 july 2010]
//
// The file is created with mode 0644 if it does not exist yet, and a write
// lock covering the whole file is requested with fcntl(F_SETLKW), so the
// call waits until no other process holds a conflicting lock.
//
// filename: path of the lock file.
// returns:  an open descriptor that holds the lock, or -1 if the file could
//           not be opened or the lock could not be obtained. The caller owns
//           the descriptor and must eventually close() it; closing it also
//           drops the lock.
int lock ( const char *filename )
{
  // One open() both creates and opens the file; there is no need to create
  // it, close it and then open it a second time.
  int fd = open (filename, O_CREAT|O_RDWR, 00644);
  if (fd < 0) return -1;

  // Fill the request member by member: the order of the members of
  // struct flock differs between platforms, so a positional initialiser
  // would set the wrong fields on some systems.
  struct flock request;
  memset (&request, 0, sizeof (request));
  request.l_type = F_WRLCK;
  request.l_whence = SEEK_SET;
  request.l_start = 0;
  request.l_len = 0; // a length of zero means "through to end of file"

  // a blocking lock request may be interrupted by a signal; just try again
  int status;
  do {
    status = fcntl (fd, F_SETLKW, &request);
  } while (status == -1 && errno == EINTR);

  if (status == -1) {
    // never hand back a descriptor that does not actually hold the lock
    close (fd);
    return -1;
  }
  return fd;
}
75
// Release the whole-file advisory lock held through a descriptor
// (POSIX only).
// [hs, 2 july 2010]
//
// fd:      descriptor on which the lock was taken.
// returns: 0 when the lock was released, -1 when fcntl() rejected the
//          request (for instance because fd is not an open descriptor).
// The descriptor itself is left open; closing it is up to the caller.
int unlock ( int fd )
{
  // Fill the request member by member: the order of the members of
  // struct flock differs between platforms, so a positional initialiser
  // would set the wrong fields on some systems.
  struct flock request;
  memset (&request, 0, sizeof (request));
  request.l_type = F_UNLCK;
  request.l_whence = SEEK_SET;
  request.l_start = 0;
  request.l_len = 0; // a length of zero means "through to end of file"

  // releasing a lock never has to wait, so the non-blocking form is enough
  if (fcntl (fd, F_SETLK, &request) == -1) return -1;
  return 0;
}
86
87int main (int argc, char *argv[]) {
88
89 ///out << "===== TXT2DB+Locking=====" << endl;
90
91 int block_size = 0;
92 GDBM_FILE dbf;
93 char c;
94 text_t key;
95 text_t value;
96 text_t tmp;
97 int num_dashes = 0;
98
99 // sanity check
100 if (argc != 2 && argc != 3) {
101 print_usage (argv[0]);
102 exit (0);
103 }
104
105 char *dbname;
106 int update = 0;
107 int delkey = 0;
108 int switched_flags = 0;
109
110 if (argc == 3) {
111 // legacy support
112 if (strcmp (argv[1], "-append") == 0) {
113 update = 1;
114 dbname = argv[2];
115 } else if (strcmp (argv[1], "-update") == 0) {
116 update = 1;
117 dbname = argv[2];
118 } else {
119 cerr << argv[1] << " is not a valid option." << endl << endl;
120 print_usage (argv[0]);
121 exit (0);
122 }
123 } else dbname = argv[1];
124
125
126 // open the database
127 // note that GDBM_FAST is obsolete on newer versions of gdbm
128 int read_write = GDBM_NEWDB | GDBM_FAST;
129 if (update) read_write = GDBM_WRCREAT | GDBM_FAST;
130
131/*
132#ifdef __WIN32__
133 dbf = gdbm_open (dbname, block_size, read_write, 00664, NULL, 1);
134#else
135 dbf = gdbm_open (dbname, block_size, read_write, 00664, NULL);
136#endif
137 if (dbf == NULL) {
138 cerr << "couldn't create " << dbname << endl;
139 exit (0);
140 }
141*/
142
143 cin.get(c);
144 while (!cin.eof()) {
145 num_dashes = 0;
146 key = "";
147 value = "";
148
149 // Parse out 'key' from [key]\n
150
151 // scan for first occurrence of [
152 while (!cin.eof() && c != '[') cin.get(c);
153
154 if (!cin.eof()) cin.get(c); // skip [
155
156 // now look for closing ], building up 'key' as we go
157 while (!cin.eof() && c != ']') {
158 key.push_back ((unsigned char)c);
159 cin.get(c);
160 }
161
162 if (!cin.eof()) {
163 // most likely an eol char, but if '-', then signifies record
164 // is to be deleted, not added
165 cin.get(c);
166 if (c == '-') {
167 delkey = 1;
168 }
169 else {
170 delkey = 0;
171 }
172 }
173 while (!cin.eof() && (c == '\n' || c == '\r')) cin.get(c);
174
175 // look for 70 dashes
176 tmp = "";
177 while (!cin.eof() && (num_dashes < 70)) {
178 if (c == '\n') {
179 tmp.push_back ((unsigned char)c);
180 num_dashes = 0;
181
182 } else if (c == '\r') {
183 // Here we are able to process both Windows-specific text files
184 // (containing carriage-return, newline) and Linux text files
185 // (containing only newline characters) by ignoring the Windows'
186 // carriage-return altogether so that we produce a uniform database
187 // file from either system's type of text file.
188 // If we don't ignore the carriage return here, txt.gz files
189 // produced on Windows cause a GS library running on Linux to break.
190 num_dashes = 0;
191
192 } else if (c == '-') {
193 tmp.push_back ((unsigned char)c);
194 ++num_dashes;
195
196 } else {
197 value += tmp;
198 value.push_back ((unsigned char)c);
199 tmp = "";
200 num_dashes = 0;
201 }
202 cin.get(c);
203 }
204
205 // if the key is not an empty string store this key-value pair
206 if (!key.empty()) {
207 // convert key to a datum datatype
208 datum key_data;
209 key_data.dptr = key.getcstr();
210 if (key_data.dptr == NULL) {
211 cerr << "NULL key_data.dptr" << endl;
212 exit (0);
213 }
214 key_data.dsize = strlen(key_data.dptr);
215 // moved for better localisation at the expense of some speed
216 // and lock before the operation
217 // [hs, 2 july 2010]
218 int thelock = lock ("gdb.lock");
219#ifdef __WIN32__
220 dbf = gdbm_open (dbname, block_size, read_write, 00664, NULL, 1);
221#else
222 dbf = gdbm_open (dbname, block_size, read_write, 00664, NULL);
223#endif
224 if (dbf == NULL) {
225 cerr << "couldn't create " << dbname << endl;
226 exit (0);
227 }
228 // If opening was successful the first time, and regardless of what flags
229 // we may have been given, we must make future opens '-update' (rather
230 // than erase the file over and over!)
231 if (!update && !switched_flags)
232 {
233 read_write = GDBM_WRCREAT | GDBM_FAST;
234 switched_flags = 1;
235 }
236
237 if (delkey) {
238 // delete the given key
239 if (gdbm_delete(dbf, key_data) < 0) {
240 cerr << "gdbm_delete returned an error" << endl;
241 }
242 }
243 else {
244
245 // add/update
246
247 // convert value to a datum datatype
248 datum value_data;
249 value_data.dptr = value.getcstr();
250 if (value_data.dptr == NULL) {
251 cerr << "NULL value_data.dptr" << endl;
252 exit (0);
253 }
254 value_data.dsize = strlen(value_data.dptr);
255
256 // store the value
257 if (gdbm_store (dbf, key_data, value_data, GDBM_REPLACE) < 0) {
258 cerr << "gdbm_store returned an error" << endl;
259 exit (0);
260 }
261
262 free(value_data.dptr);
263 }
264
265 // moved for better localisation at the expense of some speed
266 // and unlock after the operation
267 // [hs, 2 july 2010]
268 gdbm_close (dbf);
269 unlock (thelock);
270
271 free(key_data.dptr);
272 }
273 }
274/* gdbm_close (dbf); */
275
276 return 0;
277}
Note: See TracBrowser for help on using the repository browser.