[3745] | 1 | /**************************************************************************
|
---|
| 2 | *
|
---|
| 3 | * mg_text_merge.c --- merge *.text, *.text.idx files
|
---|
| 4 | * part of the mgmerge utility
|
---|
| 5 | * Copyright (C) 1995 Shane Hudson ([email protected])
|
---|
| 6 | *
|
---|
| 7 | * This program is free software; you can redistribute it and/or modify
|
---|
| 8 | * it under the terms of the GNU General Public License as published by
|
---|
| 9 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 10 | * (at your option) any later version.
|
---|
| 11 | *
|
---|
| 12 | * This program is distributed in the hope that it will be useful,
|
---|
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 15 | * GNU General Public License for more details.
|
---|
| 16 | *
|
---|
| 17 | * You should have received a copy of the GNU General Public License
|
---|
| 18 | * along with this program; if not, write to the Free Software
|
---|
| 19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 20 | *
|
---|
| 21 | * $Id: mg_text_merge.c 3745 2003-02-20 21:20:24Z mdewsnip $
|
---|
| 22 | * Last edited: November 11 1994
|
---|
| 23 | *
|
---|
| 24 | **************************************************************************/
|
---|
| 25 |
|
---|
| 26 | #include "sysfuncs.h"
|
---|
| 27 |
|
---|
| 28 | #include "messages.h"
|
---|
| 29 | #include "timing.h"
|
---|
| 30 | #include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 31 |
|
---|
| 32 | #include "locallib.h"
|
---|
| 33 | #include "mg.h"
|
---|
| 34 | #include "mg_merge.h"
|
---|
| 35 | #include "mg_files.h"
|
---|
| 36 | #include "text.h"
|
---|
| 37 |
|
---|
| 38 |
|
---|
| 39 | /**** GLOBALS ****/
|
---|
| 40 | FILE *text[3], *idx[3];
|
---|
| 41 |
|
---|
| 42 | typedef char FileName[256];
|
---|
| 43 | FileName old_name, new_name, merge_name;
|
---|
| 44 |
|
---|
| 45 | long magicsize; /* == where the header in a file begins */
|
---|
| 46 |
|
---|
| 47 | compressed_text_header cth[3];
|
---|
| 48 |
|
---|
| 49 | /*=======================================================================
|
---|
| 50 | * init_merge_text(): open files, set up global variables, etc
|
---|
| 51 | *=======================================================================*/
|
---|
| 52 | int
|
---|
| 53 | init_merge_text ()
|
---|
| 54 | {
|
---|
| 55 |
|
---|
| 56 | /* open .text files */
|
---|
| 57 | text[OLD] = open_file (old_name, TEXT_SUFFIX, "r+b",
|
---|
| 58 | MAGIC_TEXT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
| 59 | magicsize = ftell (text[OLD]);
|
---|
| 60 | fread (&cth[OLD], sizeof (cth[OLD]), 1, text[OLD]);
|
---|
| 61 |
|
---|
| 62 | text[NEW] = open_file (new_name, TEXT_SUFFIX, "rb+",
|
---|
| 63 | MAGIC_TEXT, MG_ABORT);
|
---|
| 64 | fread (&cth[NEW], sizeof (cth[NEW]), 1, text[NEW]);
|
---|
| 65 |
|
---|
| 66 | /* open .text.idx files */
|
---|
| 67 | idx[OLD] = open_file (old_name, TEXT_IDX_SUFFIX, "rb+",
|
---|
| 68 | MAGIC_TEXI, MG_ABORT);
|
---|
| 69 | fread (&cth[OLD], sizeof (cth[OLD]), 1, idx[OLD]);
|
---|
| 70 |
|
---|
| 71 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 72 | NTOHUL(cth[OLD].num_of_docs);
|
---|
| 73 | NTOHD(cth[OLD].num_of_bytes); /* [RJM 07/97: 4G limit] */
|
---|
| 74 | NTOHUL(cth[OLD].num_of_words);
|
---|
| 75 | NTOHUL(cth[OLD].length_of_longest_doc);
|
---|
| 76 | NTOHD(cth[OLD].ratio);
|
---|
| 77 |
|
---|
| 78 | idx[NEW] = open_file (new_name, TEXT_IDX_SUFFIX, "rb+",
|
---|
| 79 | MAGIC_TEXI, MG_ABORT);
|
---|
| 80 | fread (&cth[NEW], sizeof (cth[NEW]), 1, idx[NEW]);
|
---|
| 81 |
|
---|
| 82 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 83 | NTOHUL(cth[NEW].num_of_docs);
|
---|
| 84 | NTOHD(cth[NEW].num_of_bytes); /* [RJM 07/97: 4G limit] */
|
---|
| 85 | NTOHUL(cth[NEW].num_of_words);
|
---|
| 86 | NTOHUL(cth[NEW].length_of_longest_doc);
|
---|
| 87 | NTOHD(cth[NEW].ratio);
|
---|
| 88 |
|
---|
| 89 | idx[MERGE] = create_file (merge_name, TEXT_IDX_SUFFIX, "wb",
|
---|
| 90 | MAGIC_TEXI, MG_ABORT);
|
---|
| 91 | return OK;
|
---|
| 92 | }
|
---|
| 93 |
|
---|
| 94 |
|
---|
| 95 | /*=======================================================================
|
---|
| 96 | * process_merge_text(): merge the files
|
---|
| 97 | *=======================================================================*/
|
---|
| 98 | int
|
---|
| 99 | process_merge_text (void)
|
---|
| 100 | {
|
---|
| 101 | int i;
|
---|
| 102 | u_long data, offset;
|
---|
| 103 | byte c;
|
---|
| 104 |
|
---|
| 105 | /* update and write merged header to .text and .text.idx files */
|
---|
| 106 | /* they have the exact same header */
|
---|
| 107 | cth[MERGE].num_of_docs = cth[OLD].num_of_docs
|
---|
| 108 | + cth[NEW].num_of_docs;
|
---|
| 109 | cth[MERGE].num_of_bytes = cth[OLD].num_of_bytes
|
---|
| 110 | + cth[NEW].num_of_bytes;
|
---|
| 111 | cth[MERGE].num_of_words = cth[OLD].num_of_words;
|
---|
| 112 | cth[MERGE].length_of_longest_doc =
|
---|
| 113 | (cth[OLD].length_of_longest_doc > cth[NEW].length_of_longest_doc
|
---|
| 114 | ? cth[OLD].length_of_longest_doc
|
---|
| 115 | : cth[NEW].length_of_longest_doc);
|
---|
| 116 | cth[MERGE].ratio = ((cth[OLD].num_of_bytes * cth[OLD].ratio) +
|
---|
| 117 | (cth[NEW].num_of_bytes * cth[NEW].ratio))
|
---|
| 118 | / cth[MERGE].num_of_bytes;
|
---|
| 119 |
|
---|
| 120 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 121 | HTONUL(cth[MERGE].num_of_docs);
|
---|
| 122 | HTOND(cth[MERGE].num_of_bytes); /* [RJM 07/97: 4G limit] */
|
---|
| 123 | HTONUL(cth[MERGE].num_of_words);
|
---|
| 124 | HTONUL(cth[MERGE].length_of_longest_doc);
|
---|
| 125 | HTOND(cth[MERGE].ratio);
|
---|
| 126 |
|
---|
| 127 | fwrite (&cth[MERGE], sizeof (cth[MERGE]), 1, idx[MERGE]);
|
---|
| 128 | fseek (text[OLD], magicsize, 0);
|
---|
| 129 | fwrite (&cth[MERGE], sizeof (cth[MERGE]), 1, text[OLD]);
|
---|
| 130 |
|
---|
| 131 | /*
|
---|
| 132 | * Update *.text.idx: need to know where each new doc starts
|
---|
| 133 | * in the appended .text file
|
---|
| 134 | */
|
---|
| 135 | for (i = 0; i < cth[OLD].num_of_docs; i++)
|
---|
| 136 | {
|
---|
| 137 | fread (&data, sizeof (u_long), 1, idx[OLD]);
|
---|
| 138 | fwrite (&data, sizeof (u_long), 1, idx[MERGE]);
|
---|
| 139 | }
|
---|
| 140 |
|
---|
| 141 | /* offset is the amount to add to each entry from idx[NEW] */
|
---|
| 142 | fread (&offset, sizeof (u_long), 1, idx[OLD]);
|
---|
| 143 | NTOHUL(offset); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 144 | offset -= (4 + sizeof (cth[OLD])); /* 4 for the magic number */
|
---|
| 145 |
|
---|
| 146 | for (i = 0; i < cth[NEW].num_of_docs; i++)
|
---|
| 147 | {
|
---|
| 148 | fread (&data, sizeof (u_long), 1, idx[NEW]);
|
---|
| 149 | NTOHUL(data); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 150 | data += offset;
|
---|
| 151 | HTONUL(data); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 152 | fwrite (&data, sizeof (u_long), 1, idx[MERGE]);
|
---|
| 153 | }
|
---|
| 154 | /* write last u_long in idx[MERGE] (= length of file) */
|
---|
| 155 | fread (&data, sizeof (u_long), 1, idx[NEW]);
|
---|
| 156 | NTOHUL(data); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 157 | data += offset;
|
---|
| 158 | HTONUL(data); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 159 | fwrite (&data, sizeof (u_long), 1, idx[MERGE]);
|
---|
| 160 |
|
---|
| 161 | /******* update .text *******/
|
---|
| 162 | /* simply cat's the files together, except for the headers
|
---|
| 163 | * and magic numbers, of course
|
---|
| 164 | */
|
---|
| 165 | fseek (text[OLD], 0L, 2);
|
---|
| 166 | while (!feof (text[NEW]))
|
---|
| 167 | {
|
---|
| 168 | fread (&c, sizeof (c), 1, text[NEW]);
|
---|
| 169 | if (!feof (text[NEW]))
|
---|
| 170 | fwrite (&c, sizeof (c), 1, text[OLD]);
|
---|
| 171 | }
|
---|
| 172 |
|
---|
| 173 | return OK;
|
---|
| 174 | }
|
---|
| 175 |
|
---|
| 176 |
|
---|
| 177 |
|
---|
| 178 | /*=======================================================================
|
---|
| 179 | * done_merge_text(): close files.
|
---|
| 180 | *=======================================================================*/
|
---|
| 181 | int
|
---|
| 182 | done_merge_text (void)
|
---|
| 183 | {
|
---|
| 184 | fclose (idx[MERGE]);
|
---|
| 185 | fclose (text[OLD]);
|
---|
| 186 | fclose (idx[OLD]);
|
---|
| 187 | fclose (text[NEW]);
|
---|
| 188 | fclose (idx[NEW]);
|
---|
| 189 |
|
---|
| 190 | fprintf (stderr, "mg_text_merge: %ld documents added to %s\n",
|
---|
| 191 | cth[NEW].num_of_docs, merge_name);
|
---|
| 192 |
|
---|
| 193 | return OK;
|
---|
| 194 | }
|
---|
| 195 |
|
---|
| 196 |
|
---|
/*=======================================================================
 * usage(): print the command-line synopsis on stderr and exit with
 * status 1.  Does not return.
 *
 * The synopsis lists every option that main() passes to getopt():
 * -h (help), -d (handed to set_basepath) and -f (collection name).
 *=======================================================================*/
void
usage (char *progname)
{
  fprintf (stderr, "usage: %s [-h] [-d basepath] -f collection_name\n",
           progname);
  exit (1);
}
|
---|
| 206 |
|
---|
| 207 |
|
---|
| 208 | /*=======================================================================
|
---|
| 209 | * main()
|
---|
| 210 | *=======================================================================*/
|
---|
| 211 | int main (int argc, char *argv[])
|
---|
| 212 | {
|
---|
| 213 | char *progname;
|
---|
| 214 | ProgTime start;
|
---|
| 215 | int ch; /* for command line processing */
|
---|
| 216 |
|
---|
| 217 | progname = argv[0];
|
---|
| 218 | msg_prefix = argv[0];
|
---|
| 219 | merge_name[0] = '\0';
|
---|
| 220 |
|
---|
| 221 | while ((ch = getopt (argc, argv, "f:d:h")) != -1)
|
---|
| 222 | switch (ch)
|
---|
| 223 | {
|
---|
| 224 | case 'f':
|
---|
| 225 | strcpy (merge_name, optarg);
|
---|
| 226 | break;
|
---|
| 227 | case 'd':
|
---|
| 228 | set_basepath (optarg);
|
---|
| 229 | break;
|
---|
| 230 | case 'h':
|
---|
| 231 | case '?':
|
---|
| 232 | default:
|
---|
| 233 | usage (progname);
|
---|
| 234 | }
|
---|
| 235 |
|
---|
| 236 | if (merge_name[0] == '\0')
|
---|
| 237 | usage (progname);
|
---|
| 238 | strcpy (old_name, merge_name);
|
---|
| 239 | strcat (old_name, ".old");
|
---|
| 240 | strcpy (new_name, merge_name);
|
---|
| 241 | strcat (new_name, ".new");
|
---|
| 242 |
|
---|
| 243 | GetTime (&start);
|
---|
| 244 | init_merge_text ();
|
---|
| 245 | process_merge_text ();
|
---|
| 246 | done_merge_text ();
|
---|
| 247 | Message ("%s\n", ElapsedTime (&start, NULL));
|
---|
| 248 | return 0;
|
---|
| 249 | }
|
---|