source: gs2-extensions/tdb/trunk/src/txt2tdb-src/txt2tdb.cpp@ 30217

Last change on this file since 30217 was 24696, checked in by jmt12, 13 years ago

Removing debug code, and adding in strings.h header to improve portability (although I guess it also makes portability worse on Windoze - oh well)

File size: 7.7 KB
Line 
1/**********************************************************************
2 *
3 * txt2tdb.cpp -- A utility to convert a stream of text, ala buildproc
4 * encoded output, into a TDB file.
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * Copyright (C) 2011 The New Zealand Digital Library Project
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 **********************************************************************/
27
28#if defined(GSDL_USE_OBJECTSPACE)
29#include <ospace\std\iostream>
30#elif defined(GSDL_USE_IOS_H)
31#include <iostream.h>
32#else
33#include <iostream>
34#endif
35
36#include <cstdlib>
37#include <cstring>
38
39#include <time.h>
40
41#include "tdb.h"
42#include "text_t.h"
43
44// use the standard namespace
45#if !defined (GSDL_NAMESPACE_BROKEN)
46#if defined(GSDL_USE_OBJECTSPACE)
47using namespace ospace::std;
48#else
49using namespace std;
50#endif
51#endif
52
/**
 * Print the command-line synopsis and the list of supported options to the
 * standard error stream.
 *
 * @param program_name  the name to show as the executable in the synopsis
 *                      (main() passes argv[0])
 */
void
printUsage (char *program_name)
{
  // One chained expression; every endl still flushes exactly as before.
  cerr << "usage: " << program_name << " [-append] database-name [-debug]" << endl
       << endl
       << "options:" << endl
       << " -append append to existing database" << endl
       << " -debug add timing information to database" << endl
       << endl;
}
63/** printUsage() **/
64
65/*
66void
67debugLog(TDB_CONTEXT * tdb, char * msg_content)
68{
69 // Since this log will be used to track order of events, we need an indicator
70 // of time
71 time_t seconds = time(NULL);
72 // We also need some idea of what thread this is - let's try and use the PID
73 pid_t process_id = getpid();
74 // Append the message to the entry in the db (fixed key "debuglog")
75 TDB_DATA key_datum;
76 key_datum.dptr = (unsigned char *)"debuglog";
77 key_datum.dsize = 8;
78 text_t message = "[" + text_t(seconds) + "][" + text_t(process_id) + "] " + msg_content + "\n";
79 TDB_DATA msg_datum;
80 msg_datum.dptr = (unsigned char *) message.getcstr();
81 msg_datum.dsize = message.size();
82 if (tdb_append(tdb, key_datum, msg_datum) != 0)
83 {
84 cerr << "txt2tdb::debugLog() - tdb_append returned an error" << endl;
85 exit (0);
86 }
87}
88*/
89/** debugLog() **/
90
91/**
92 */
93int
94main (int argc, char *argv[])
95{
96 // sanity check
97 if (2 > argc || argc > 4)
98 {
99 printUsage (argv[0]);
100 exit (0);
101 }
102
103 char *dbname;
104 int append = 0;
105 int delkey = 0;
106 int debug = 0;
107 if (argc == 3)
108 {
109 if (strcmp (argv[1], "-append") == 0)
110 {
111 append = 1;
112 dbname = argv[2];
113 }
114 else if (strcmp(argv[2], "-debug") == 0)
115 {
116 dbname = argv[1];
117 debug = 1;
118 }
119 else
120 {
121 cerr << argv[1] << " is not a valid option." << endl << endl;
122 printUsage(argv[0]);
123 exit (0);
124 }
125 }
126 else if (argc == 4)
127 {
128 if (strcmp (argv[1], "-append") == 0 && strcmp (argv[3], "-debug") == 0)
129 {
130 append = 1;
131 dbname = argv[2];
132 debug = 1;
133 }
134 else
135 {
136 cerr << argv[1] << " is not a valid option." << endl << endl;
137 printUsage(argv[0]);
138 exit (0);
139 }
140 }
141 else
142 {
143 dbname = argv[1];
144 }
145
146 // open the database
147 int hash_size = 0;
148 int tdb_flags = TDB_DEFAULT; // Default = 0
149 if (append == 0)
150 {
151 tdb_flags = TDB_CLEAR_IF_FIRST;
152 }
153 // Disable file IO for testing purposes
154 /*tdb_flags = tdb_flags | TDB_INTERNAL;*/
155
156 int tdb_store_flags = TDB_DEFAULT; // used later when storing
157 int open_flags = O_RDWR | O_CREAT;
158 TDB_CONTEXT *tdb = tdb_open(dbname, hash_size, tdb_flags, open_flags, 0664);
159 if (!tdb)
160 {
161 cerr << "txt2tdb::main() - couldn't create " << dbname << endl;
162 exit (0);
163 }
164
165 // If we are debugging, we'll write that we just opened the connection
166 /*
167 if (debug)
168 {
169 debugLog(tdb, "opened connection to database for read/write");
170 }
171 */
172
173 char c;
174 cin.get(c);
175 while (!cin.eof())
176 {
177 int num_dashes = 0;
178 text_t key = "";
179 text_t value = "";
180
181 // Parse out 'key' from [key]\n
182 // - scan for first occurrence of [
183 while (!cin.eof() && c != '[')
184 {
185 cin.get(c);
186 }
187 // - skip [
188 if (!cin.eof())
189 {
190 cin.get(c);
191 }
192 // - now look for closing ], building up 'key' as we go
193 while (!cin.eof() && c != ']')
194 {
195 key.push_back ((unsigned char)c);
196 cin.get(c);
197 }
198 if (!cin.eof())
199 {
200 // most likely an eol char, but if '-', then signifies record
201 // is to be deleted, not added
202 cin.get(c);
203 if (c == '-')
204 {
205 delkey = 1;
206 }
207 else
208 {
209 delkey = 0;
210 }
211 }
212 while (!cin.eof() && (c == '\n' || c == '\r'))
213 {
214 cin.get(c);
215 }
216 // - read in the value, watching for 70 dashes (the end)
217 text_t tmp = "";
218 while (!cin.eof() && (num_dashes < 70))
219 {
220 if (c == '\n')
221 {
222 tmp.push_back ((unsigned char)c);
223 num_dashes = 0;
224 }
225 else if (c == '\r')
226 {
227 // Here we are able to process both Windows-specific text files
228 // (containing carriage-return, newline) and Linux text files
229 // (containing only newline characters) by ignoring the Windows'
230 // carriage-return altogether so that we produce a uniform database
231 // file from either system's type of text file.
232 // If we don't ignore the carriage return here, txt.gz files
233 // produced on Windows cause a GS library running on Linux to break.
234 num_dashes = 0;
235 }
236 else if (c == '-')
237 {
238 tmp.push_back ((unsigned char)c);
239 ++num_dashes;
240 }
241 else
242 {
243 value += tmp;
244 value.push_back ((unsigned char)c);
245 tmp = "";
246 num_dashes = 0;
247 }
248 cin.get(c);
249 }
250
251 // We should now have a key/value pair. If the key is not an empty string
252 // store this key-value pair
253 if (!key.empty())
254 {
255 // convert key to a datum datatype
256 TDB_DATA key_data;
257 // [why are cstrings from text_t not unsigned? from what I can see we
258 // explicitly cast to char * when we return]
259 key_data.dptr = (unsigned char*)key.getcstr();
260 if (key_data.dptr == NULL)
261 {
262 cerr << "NULL key_data.dptr" << endl;
263 exit (0);
264 }
265 key_data.dsize = key.size();
266 // - delete the given key if we've been asked to
267 if (delkey)
268 {
269 if (tdb_delete(tdb, key_data) < 0)
270 {
271 cerr << "tdb_delete returned an error" << endl;
272 }
273 }
274 // - otherwise add
275 else
276 {
277 // - convert value to a datum datatype
278 TDB_DATA value_data;
279 value_data.dptr = (unsigned char*)value.getcstr();
280 if (value_data.dptr == NULL)
281 {
282 cerr << "NULL value_data.dptr" << endl;
283 exit (0);
284 }
285 value_data.dsize = value.size();
286 // - store the value
287 if (tdb_store(tdb, key_data, value_data, tdb_store_flags) < 0)
288 {
289 cerr << "tdb_store returned an error" << endl;
290 exit (0);
291 }
292 }
293 }
294 }
295
296 // If we are debugging, we'll write that we are about to close the connection
297 /*
298 if (debug)
299 {
300 debugLog(tdb, "closing connection to database");
301 }
302 */
303
304 // Close the database connection
305 if (tdb_close(tdb) < 0)
306 {
307 cerr << "tdb_close returned an error" << endl;
308 exit (0);
309 }
310
311 return 0;
312}
Note: See TracBrowser for help on using the repository browser.