/********************************************************************** * * txt2tdb.cpp -- A utility to convert a stream of text, ala buildproc * encoded output, into a TDB file. * * A component of the Greenstone digital library software * from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * * Copyright (C) 2011 The New Zealand Digital Library Project * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * **********************************************************************/ #if defined(GSDL_USE_OBJECTSPACE) #include #elif defined(GSDL_USE_IOS_H) #include #else #include #endif #include "tdb.h" #include "text_t.h" // use the standard namespace #if !defined (GSDL_NAMESPACE_BROKEN) #if defined(GSDL_USE_OBJECTSPACE) using namespace ospace::std; #else using namespace std; #endif #endif void printUsage (char *program_name) { cerr << "usage: " << program_name << " [options] database-name" << endl << endl; cerr << "options:" << endl; cerr << " -append append to existing database" << endl << endl; } /** printUsage() **/ int main (int argc, char *argv[]) { // sanity check if (argc != 2 && argc != 3) { printUsage (argv[0]); exit (0); } char *dbname; int append = 0; int delkey = 0; if (argc == 3) { if (strcmp (argv[1], "-append") == 0) { append = 1; dbname = argv[2]; } else { cerr << argv[1] << " is not a valid option." << endl << endl; printUsage(argv[0]); exit (0); } } else { dbname = argv[1]; } // open the database int hash_size = 0; int tdb_flags = TDB_DEFAULT; // Default = 0 if (append == 0) { tdb_flags = TDB_CLEAR_IF_FIRST; } int tdb_store_flags = TDB_DEFAULT; // used later when storing int open_flags = O_RDWR | O_CREAT; TDB_CONTEXT *tdb = tdb_open(dbname, hash_size, tdb_flags, open_flags, 0664); if (!tdb) { cerr << "couldn't create " << dbname << endl; exit (0); } char c; cin.get(c); while (!cin.eof()) { int num_dashes = 0; text_t key = ""; text_t value = ""; // Parse out 'key' from [key]\n // - scan for first occurrence of [ while (!cin.eof() && c != '[') { cin.get(c); } // - skip [ if (!cin.eof()) { cin.get(c); } // - now look for closing ], building up 'key' as we go while (!cin.eof() && c != ']') { key.push_back ((unsigned char)c); cin.get(c); } if (!cin.eof()) { // most likely an eol char, but if '-', then signifies record // is to be deleted, not added cin.get(c); if (c == '-') { delkey = 1; } else { delkey = 0; } } while (!cin.eof() && (c == '\n' || c == '\r')) { cin.get(c); } // - read in the value, watching for 70 dashes (the end) text_t tmp = ""; while (!cin.eof() && (num_dashes < 70)) { if (c == '\n') { tmp.push_back ((unsigned char)c); num_dashes = 0; } else if (c == '\r') { // Here we are able to process both Windows-specific text files // (containing carriage-return, newline) and Linux text files // (containing only newline characters) by ignoring the Windows' // carriage-return altogether so that we produce a uniform database // file from either system's type of text file. // If we don't ignore the carriage return here, txt.gz files // produced on Windows cause a GS library running on Linux to break. num_dashes = 0; } else if (c == '-') { tmp.push_back ((unsigned char)c); ++num_dashes; } else { value += tmp; value.push_back ((unsigned char)c); tmp = ""; num_dashes = 0; } cin.get(c); } // We should now have a key/value pair. If the key is not an empty string // store this key-value pair if (!key.empty()) { // convert key to a datum datatype TDB_DATA key_data; // [why are cstrings from text_t not unsigned? from what I can see we // explicitly cast to char * when we return] key_data.dptr = (unsigned char*)key.getcstr(); if (key_data.dptr == NULL) { cerr << "NULL key_data.dptr" << endl; exit (0); } key_data.dsize = key.size(); // - delete the given key if we've been asked to if (delkey) { if (tdb_delete(tdb, key_data) < 0) { cerr << "tdb_delete returned an error" << endl; } } // - otherwise add else { // - convert value to a datum datatype TDB_DATA value_data; value_data.dptr = (unsigned char*)value.getcstr(); if (value_data.dptr == NULL) { cerr << "NULL value_data.dptr" << endl; exit (0); } value_data.dsize = value.size(); // - store the value if (tdb_store(tdb, key_data, value_data, tdb_store_flags) < 0) { cerr << "tdb_store returned an error" << endl; exit (0); } } } } // Close the database connection if (tdb_close(tdb) < 0) { cerr << "tdb_close returned an error" << endl; exit (0); } return 0; }