source: gs2-extensions/tdb/trunk/src/txt2tdb-src/txt2tdb.cpp@ 30369

Last change on this file since 30369 was 30369, checked in by jmt12, 8 years ago

Rewrite of argument parsing code to be a little more forgiving in terms of argument order

File size: 7.6 KB
Line 
1/**********************************************************************
2 *
3 * txt2tdb.cpp -- A utility to convert a stream of text, ala buildproc
4 * encoded output, into a TDB file.
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * Copyright (C) 2011 The New Zealand Digital Library Project
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 **********************************************************************/
27
28#if defined(GSDL_USE_OBJECTSPACE)
29#include <ospace\std\iostream>
30#elif defined(GSDL_USE_IOS_H)
31#include <iostream.h>
32#else
33#include <iostream>
34#endif
35
36#include <cstdlib>
37#include <cstring>
38
39#include <time.h>
40
41#include "tdb.h"
42#include "text_t.h"
43
44// use the standard namespace
45#if !defined (GSDL_NAMESPACE_BROKEN)
46#if defined(GSDL_USE_OBJECTSPACE)
47using namespace ospace::std;
48#else
49using namespace std;
50#endif
51#endif
52
/**
 * Write the command-line synopsis and the list of supported options to
 * standard error.
 *
 * @param program_name the name the tool was invoked as (callers pass argv[0])
 */
void
printUsage (char *program_name)
{
  // One chained expression; every endl is kept so the flush points are the
  // same as when each line is written by its own statement.
  cerr << "usage: " << program_name << " database-name [-append] [-debug]" << endl
       << endl
       << "options:" << endl
       << " -append append to existing database" << endl
       << " -debug add timing information to database" << endl
       << endl;
}
63/** printUsage() **/
64
65/*
66void
67debugLog(TDB_CONTEXT * tdb, char * msg_content)
68{
69 // Since this log will be used to track order of events, we need an indicator
70 // of time
71 time_t seconds = time(NULL);
72 // We also need some idea of what thread this is - let's try and use the PID
73 pid_t process_id = getpid();
74 // Append the message to the entry in the db (fixed key "debuglog")
75 TDB_DATA key_datum;
76 key_datum.dptr = (unsigned char *)"debuglog";
77 key_datum.dsize = 8;
78 text_t message = "[" + text_t(seconds) + "][" + text_t(process_id) + "] " + msg_content + "\n";
79 TDB_DATA msg_datum;
80 msg_datum.dptr = (unsigned char *) message.getcstr();
81 msg_datum.dsize = message.size();
82 if (tdb_append(tdb, key_datum, msg_datum) != 0)
83 {
84 cerr << "txt2tdb::debugLog() - tdb_append returned an error" << endl;
85 exit (0);
86 }
87}
88*/
89/** debugLog() **/
90
91/**
92 */
93int
94main (int argc, char *argv[])
95{
96 // sanity check
97 if (2 > argc || argc > 4)
98 {
99 if (2 > argc)
100 {
101 cerr << "Not enough arguments." << endl << endl;
102 }
103 else
104 {
105 cerr << "Too many arguments." << endl << endl;
106 }
107 printUsage (argv[0]);
108 exit (0);
109 }
110
111 char *dbname;
112 bool found_dbname = false;
113 int append = 0;
114 int delkey = 0;
115 int debug = 0;
116 for (int i = 1; i < argc; i++)
117 {
118 if (strcmp(argv[i], "-append") == 0)
119 {
120 append = 1;
121 }
122 else if (strcmp(argv[i], "-debug") == 0)
123 {
124 debug = 1;
125 }
126 else
127 {
128 dbname = argv[i];
129 found_dbname = true;
130 }
131 }
132
133 if (!found_dbname)
134 {
135 cerr << "Database path not specified." << endl << endl;
136 printUsage(argv[0]);
137 exit(0);
138 }
139
140 // open the database
141 int hash_size = 0;
142 int tdb_flags = TDB_DEFAULT; // Default = 0
143 if (append == 0)
144 {
145 tdb_flags = TDB_CLEAR_IF_FIRST;
146 }
147 // Disable file IO for testing purposes
148 /*tdb_flags = tdb_flags | TDB_INTERNAL;*/
149
150 int tdb_store_flags = TDB_DEFAULT; // used later when storing
151 int open_flags = O_RDWR | O_CREAT;
152 TDB_CONTEXT *tdb = tdb_open(dbname, hash_size, tdb_flags, open_flags, 0664);
153 if (!tdb)
154 {
155 cerr << "txt2tdb::main() - couldn't create " << dbname << endl;
156 exit (0);
157 }
158
159 // If we are debugging, we'll write that we just opened the connection
160 /*
161 if (debug)
162 {
163 debugLog(tdb, "opened connection to database for read/write");
164 }
165 */
166
167 char c;
168 cin.get(c);
169 while (!cin.eof())
170 {
171 int num_dashes = 0;
172 text_t key = "";
173 text_t value = "";
174
175 // Parse out 'key' from [key]\n
176 // - scan for first occurrence of [
177 while (!cin.eof() && c != '[')
178 {
179 cin.get(c);
180 }
181 // - skip [
182 if (!cin.eof())
183 {
184 cin.get(c);
185 }
186 // - now look for closing ], building up 'key' as we go
187 while (!cin.eof() && c != ']')
188 {
189 key.push_back ((unsigned char)c);
190 cin.get(c);
191 }
192 if (!cin.eof())
193 {
194 // most likely an eol char, but if '-', then signifies record
195 // is to be deleted, not added
196 cin.get(c);
197 if (c == '-')
198 {
199 delkey = 1;
200 }
201 else
202 {
203 delkey = 0;
204 }
205 }
206 while (!cin.eof() && (c == '\n' || c == '\r'))
207 {
208 cin.get(c);
209 }
210 // - read in the value, watching for 70 dashes (the end)
211 text_t tmp = "";
212 while (!cin.eof() && (num_dashes < 70))
213 {
214 if (c == '\n')
215 {
216 tmp.push_back ((unsigned char)c);
217 num_dashes = 0;
218 }
219 else if (c == '\r')
220 {
221 // Here we are able to process both Windows-specific text files
222 // (containing carriage-return, newline) and Linux text files
223 // (containing only newline characters) by ignoring the Windows'
224 // carriage-return altogether so that we produce a uniform database
225 // file from either system's type of text file.
226 // If we don't ignore the carriage return here, txt.gz files
227 // produced on Windows cause a GS library running on Linux to break.
228 num_dashes = 0;
229 }
230 else if (c == '-')
231 {
232 tmp.push_back ((unsigned char)c);
233 ++num_dashes;
234 }
235 else
236 {
237 value += tmp;
238 value.push_back ((unsigned char)c);
239 tmp = "";
240 num_dashes = 0;
241 }
242 cin.get(c);
243 }
244
245 // We should now have a key/value pair. If the key is not an empty string
246 // store this key-value pair
247 if (!key.empty())
248 {
249 // convert key to a datum datatype
250 TDB_DATA key_data;
251 // [why are cstrings from text_t not unsigned? from what I can see we
252 // explicitly cast to char * when we return]
253 key_data.dptr = (unsigned char*)key.getcstr();
254 if (key_data.dptr == NULL)
255 {
256 cerr << "NULL key_data.dptr" << endl;
257 exit (0);
258 }
259 key_data.dsize = key.size();
260 // - delete the given key if we've been asked to
261 if (delkey)
262 {
263 if (tdb_delete(tdb, key_data) < 0)
264 {
265 cerr << "tdb_delete returned an error" << endl;
266 }
267 }
268 // - otherwise add
269 else
270 {
271 // - convert value to a datum datatype
272 TDB_DATA value_data;
273 value_data.dptr = (unsigned char*)value.getcstr();
274 if (value_data.dptr == NULL)
275 {
276 cerr << "NULL value_data.dptr" << endl;
277 exit (0);
278 }
279 value_data.dsize = value.size();
280 // - store the value
281 if (tdb_store(tdb, key_data, value_data, tdb_store_flags) < 0)
282 {
283 cerr << "tdb_store returned an error" << endl;
284 exit (0);
285 }
286 }
287 }
288 }
289
290 // If we are debugging, we'll write that we are about to close the connection
291 /*
292 if (debug)
293 {
294 debugLog(tdb, "closing connection to database");
295 }
296 */
297
298 // Close the database connection
299 if (tdb_close(tdb) < 0)
300 {
301 cerr << "tdb_close returned an error" << endl;
302 exit (0);
303 }
304
305 return 0;
306}
Note: See TracBrowser for help on using the repository browser.