annotate insert.cpp @ 405:ef4792df8f93 api-inversion

invert audioDB::insert / audiodb_insert(). Start off by removing audioDB::insertDatum, and essentially reusing it as audiodb_insert. We now ignore the fact that the command-line parsing code has "helpfully" opened a std::ifstream for the times file and an fd for the power file, and simply go ahead and do our own dirty work. We can delete audioDB::insertDatum entirely, but unfortunately we can't delete audioDB::insertPowerData and audioDB::insertTimestamps, because the index and query code respectively use them. Instead, move the two methods closer to their single uses. audiodb_insert() is perhaps not as short and simple as it might have been hoped given the existence of audiodb_insert_datum(); some of that is C and its terrible way of making you pay every time you use dynamic memory; some of it is the fact that the three different files (feature, times, power) each requires slightly different treatment. Hey ho. We can implement audiodb_batchinsert() in terms of audiodb_insert(); the function is pleasingly small. We can't quite use it for audioDB::batchinsert yet, as we have to deal with the O2_FLAG_LARGE_ADB case (which codepath is untested in libtests/). This means that we can delete whole swathes of hideous code from audioDB.cpp, including not just the versions of audiodb_insert() and audiodb_batchinsert() but also an entire audioDB constructor. Yay. (audioDB::unitNormAndInsertL2 has also died a deserved death).
author mas01cr
date Fri, 05 Dec 2008 22:32:49 +0000
parents 1fb8bee777e5
children c279adeb47f4
rev   line source
mas01cr@239 1 #include "audioDB.h"
mas01cr@404 2 extern "C" {
mas01cr@404 3 #include "audioDB_API.h"
mas01cr@404 4 }
mas01cr@404 5 #include "audioDB-internals.h"
mas01cr@404 6
mas01cr@404 7 static bool audiodb_enough_data_space_free(adb_t *adb, off_t size) {
mas01cr@404 8 adb_header_t *header = adb->header;
mas01cr@404 9 /* FIXME: timesTableOffset isn't necessarily the next biggest offset
mas01cr@404 10 after dataOffset. Maybe make the offsets into an array that we
mas01cr@404 11 can iterate over... */
mas01cr@404 12 return (header->timesTableOffset >
mas01cr@404 13 header->dataOffset + header->length + size);
mas01cr@404 14 }
mas01cr@404 15
mas01cr@404 16 static bool audiodb_enough_per_file_space_free(adb_t *adb) {
mas01cr@404 17 /* FIXME: the comment above about the ordering of the tables applies
mas01cr@404 18 here too. */
mas01cr@404 19 adb_header_t *header = adb->header;
mas01cr@404 20 off_t file_table_length = header->trackTableOffset - header->fileTableOffset;
mas01cr@404 21 off_t track_table_length = header->dataOffset - header->trackTableOffset;
mas01cr@404 22 int fmaxfiles = file_table_length / O2_FILETABLE_ENTRY_SIZE;
mas01cr@404 23 int tmaxfiles = track_table_length / O2_TRACKTABLE_ENTRY_SIZE;
mas01cr@404 24 /* maxfiles is the _minimum_ of the two. Do not be confused... */
mas01cr@404 25 unsigned int maxfiles = fmaxfiles > tmaxfiles ? tmaxfiles : fmaxfiles;
mas01cr@404 26 return (header->numFiles < maxfiles);
mas01cr@404 27 }
mas01cr@404 28
mas01cr@404 29 /*
mas01cr@404 30 * Hey, look, a comment. Normally I wouldn't bother, as the code
mas01cr@404 31 * should be self-documenting, but a lot of logic is concentrated in
mas01cr@404 32 * this one place, so let's give an overview beforehand. To insert a
mas01cr@404 33 * datum into the database, we:
mas01cr@404 34 *
mas01cr@404 35 * 1. check write permission;
mas01cr@404 36 * 2. check !O2_FLAG_LARGE_ADB;
mas01cr@404 37 * 3. check for enough space;
mas01cr@404 38 * 4. check that datum->dim and adb->header->dim agree (or that the
mas01cr@404 39 * header dimension is zero, in which case write datum->dim to
mas01cr@404 40 * adb->header->dim).
mas01cr@404 41 * 5. check for presence of datum->key in adb->keys;
mas01cr@404 42 * 6. check for consistency between power and O2_FLAG_POWER, and
mas01cr@404 43 * times and O2_FLAG_TIMES;
mas01cr@404 44 * 7. write in data, power, times as appropriate; add to track
mas01cr@404 45 * and key tables too;
mas01cr@404 46 * 8. if O2_FLAG_L2NORM, compute norms and fill in table;
mas01cr@404 47 * 9. update adb->keys and adb->header;
mas01cr@404 48 * 10. sync adb->header with disk.
mas01cr@404 49 *
mas01cr@404 50 * Step 10 essentially commits the transaction; until we update
mas01cr@404 51 * header->length, nothing will recognize the newly-written data.
mas01cr@404 52 * In principle, if it fails, we should roll back, which we can in
mas01cr@404 53 * fact do on the assumption that nothing in step 9 can ever fail;
mas01cr@404 54 * on the other hand, if it's failed, then it's unlikely that
mas01cr@404 55 * rolling back by syncing the original header back to disk is going
mas01cr@404 56 * to work desperately well.
mas01cr@404 57 */
mas01cr@404 58 int audiodb_insert_datum(adb_t *adb, adb_datum_t *datum) {
mas01cr@404 59
mas01cr@404 60 off_t size, offset, nfiles;
mas01cr@404 61 double *l2norm_buffer, *lp, *dp;
mas01cr@404 62
mas01cr@404 63 /* 1. check write permission; */
mas01cr@404 64 if(!(adb->flags & O_RDWR)) {
mas01cr@404 65 return 1;
mas01cr@404 66 }
mas01cr@404 67 /* 2. check !O2_FLAG_LARGE_ADB; */
mas01cr@404 68 if(adb->header->flags & O2_FLAG_LARGE_ADB) {
mas01cr@404 69 return 1;
mas01cr@404 70 }
mas01cr@404 71 /* 3. check for enough space; */
mas01cr@404 72 size = sizeof(double) * datum->nvectors * datum->dim;
mas01cr@404 73 if(!audiodb_enough_data_space_free(adb, size)) {
mas01cr@404 74 return 1;
mas01cr@404 75 }
mas01cr@404 76 if(!audiodb_enough_per_file_space_free(adb)) {
mas01cr@404 77 return 1;
mas01cr@404 78 }
mas01cr@404 79 /* 4. check that datum->dim and adb->header->dim agree (or that the
mas01cr@404 80 * header dimension is zero, in which case write datum->dim to
mas01cr@404 81 * adb->header->dim).
mas01cr@404 82 */
mas01cr@404 83 if(adb->header->dim == 0) {
mas01cr@404 84 adb->header->dim = datum->dim;
mas01cr@404 85 } else if (adb->header->dim != datum->dim) {
mas01cr@404 86 return 1;
mas01cr@404 87 }
mas01cr@404 88 /* 5. check for presence of datum->key in adb->keys; */
mas01cr@404 89 if(adb->keys->count(datum->key)) {
mas01cr@404 90 /* not part of an explicit API/ABI, but we need a distinguished
mas01cr@404 91 value in this circumstance to preserve somewhat wonky behaviour
mas01cr@404 92 of audioDB::batchinsert. */
mas01cr@404 93 return 2;
mas01cr@404 94 }
mas01cr@404 95 /* 6. check for consistency between power and O2_FLAG_POWER, and
mas01cr@404 96 * times and O2_FLAG_TIMES;
mas01cr@404 97 */
mas01cr@404 98 if((datum->power && !(adb->header->flags & O2_FLAG_POWER)) ||
mas01cr@404 99 ((adb->header->flags & O2_FLAG_POWER) && !datum->power)) {
mas01cr@404 100 return 1;
mas01cr@404 101 }
mas01cr@404 102 if(datum->times && !(adb->header->flags & O2_FLAG_TIMES)) {
mas01cr@404 103 if(adb->header->numFiles == 0) {
mas01cr@404 104 adb->header->flags |= O2_FLAG_TIMES;
mas01cr@404 105 } else {
mas01cr@404 106 return 1;
mas01cr@404 107 }
mas01cr@404 108 } else if ((adb->header->flags & O2_FLAG_TIMES) && !datum->times) {
mas01cr@404 109 return 1;
mas01cr@404 110 }
mas01cr@404 111 /* 7. write in data, power, times as appropriate; add to track
mas01cr@404 112 * and key tables too;
mas01cr@404 113 */
mas01cr@404 114 offset = adb->header->length;
mas01cr@404 115 nfiles = adb->header->numFiles;
mas01cr@404 116
mas01cr@404 117 /* FIXME: checking for all these lseek()s and write()s */
mas01cr@404 118 lseek(adb->fd, adb->header->dataOffset + offset, SEEK_SET);
mas01cr@404 119 write(adb->fd, datum->data, sizeof(double) * datum->nvectors * datum->dim);
mas01cr@404 120 if(datum->power) {
mas01cr@404 121 lseek(adb->fd, adb->header->powerTableOffset + offset / datum->dim, SEEK_SET);
mas01cr@404 122 write(adb->fd, datum->power, sizeof(double) * datum->nvectors);
mas01cr@404 123 }
mas01cr@404 124 if(datum->times) {
mas01cr@404 125 lseek(adb->fd, adb->header->timesTableOffset + offset / datum->dim * 2, SEEK_SET);
mas01cr@404 126 write(adb->fd, datum->times, sizeof(double) * datum->nvectors * 2);
mas01cr@404 127 }
mas01cr@404 128 lseek(adb->fd, adb->header->trackTableOffset + nfiles * O2_TRACKTABLE_ENTRY_SIZE, SEEK_SET);
mas01cr@404 129 write(adb->fd, &datum->nvectors, O2_TRACKTABLE_ENTRY_SIZE);
mas01cr@404 130 lseek(adb->fd, adb->header->fileTableOffset + nfiles * O2_FILETABLE_ENTRY_SIZE, SEEK_SET);
mas01cr@404 131 write(adb->fd, datum->key, strlen(datum->key)+1);
mas01cr@404 132
mas01cr@404 133 /* 8. if O2_FLAG_L2NORM, compute norms and fill in table; */
mas01cr@404 134 l2norm_buffer = (double *) malloc(datum->nvectors * sizeof(double));
mas01cr@404 135
mas01cr@404 136 /* FIXME: shared code with audiodb_norm_existing() */
mas01cr@404 137 dp = datum->data;
mas01cr@404 138 lp = l2norm_buffer;
mas01cr@404 139 for(size_t i = 0; i < datum->nvectors; i++) {
mas01cr@404 140 *lp = 0;
mas01cr@404 141 for(unsigned int k = 0; k < datum->dim; k++) {
mas01cr@404 142 *lp += (*dp)*(*dp);
mas01cr@404 143 dp++;
mas01cr@404 144 }
mas01cr@404 145 lp++;
mas01cr@404 146 }
mas01cr@404 147 lseek(adb->fd, adb->header->l2normTableOffset + offset / datum->dim, SEEK_SET);
mas01cr@404 148 write(adb->fd, l2norm_buffer, sizeof(double) * datum->nvectors);
mas01cr@404 149 free(l2norm_buffer);
mas01cr@404 150
mas01cr@404 151 adb->keys->insert(datum->key);
mas01cr@404 152 adb->header->numFiles += 1;
mas01cr@404 153 adb->header->length += sizeof(double) * datum->nvectors * datum->dim;
mas01cr@404 154
mas01cr@404 155 return audiodb_sync_header(adb);
mas01cr@404 156
mas01cr@404 157 error:
mas01cr@404 158 return 1;
mas01cr@404 159 }
mas01cr@239 160
mas01cr@251 161 bool audioDB::enough_per_file_space_free() {
mas01cr@251 162 unsigned int fmaxfiles, tmaxfiles;
mas01cr@251 163 unsigned int maxfiles;
mas01cr@251 164
mas01cr@256 165 fmaxfiles = fileTableLength / O2_FILETABLE_ENTRY_SIZE;
mas01cr@256 166 tmaxfiles = trackTableLength / O2_TRACKTABLE_ENTRY_SIZE;
mas01cr@251 167 maxfiles = fmaxfiles > tmaxfiles ? tmaxfiles : fmaxfiles;
mas01cr@251 168 return(dbH->numFiles < maxfiles);
mas01cr@251 169 }
mas01cr@251 170
mas01cr@405 171 int audiodb_insert(adb_t *adb, adb_insert_t *insert) {
mas01cr@404 172 adb_datum_t datum;
mas01cr@405 173 int fd = 0;
mas01cr@405 174 FILE *file = NULL;
mas01cr@405 175 struct stat st;
mas01cr@404 176 off_t size;
mas01cr@404 177 int err;
mas01cr@405 178
mas01cr@405 179 datum.data = NULL;
mas01cr@405 180 datum.power = NULL;
mas01cr@405 181 datum.times = NULL;
mas01cr@405 182 if((fd = open(insert->features, O_RDONLY)) == -1) {
mas01cr@405 183 goto error;
mas01cr@370 184 }
mas01cr@405 185 if(fstat(fd, &st)) {
mas01cr@405 186 goto error;
mas01cr@404 187 }
mas01cr@404 188 read(fd, &(datum.dim), sizeof(uint32_t));
mas01cr@405 189 size = st.st_size - sizeof(uint32_t);
mas01cr@404 190 datum.nvectors = size / (sizeof(double) * datum.dim);
mas01cr@404 191 datum.data = (double *) malloc(size);
mas01cr@404 192 if(!datum.data) {
mas01cr@405 193 goto error;
mas01cr@404 194 }
mas01cr@404 195 read(fd, datum.data, size);
mas01cr@404 196 close(fd);
mas01cr@405 197 fd = 0;
mas01cr@405 198 if(insert->power) {
mas01cr@405 199 int dim;
mas01cr@405 200 if((fd = open(insert->power, O_RDONLY)) == -1) {
mas01cr@405 201 goto error;
mas01cr@405 202 }
mas01cr@405 203 if(fstat(fd, &st)) {
mas01cr@405 204 goto error;
mas01cr@405 205 }
mas01cr@405 206 if((st.st_size - sizeof(uint32_t)) != (size / datum.dim)) {
mas01cr@405 207 goto error;
mas01cr@405 208 }
mas01cr@405 209 read(fd, &dim, sizeof(uint32_t));
mas01cr@405 210 if(dim != 1) {
mas01cr@405 211 goto error;
mas01cr@405 212 }
mas01cr@405 213 datum.power = (double *) malloc(size / datum.dim);
mas01cr@405 214 if(!datum.power) {
mas01cr@405 215 goto error;
mas01cr@405 216 }
mas01cr@405 217 read(fd, datum.power, size / datum.dim);
mas01cr@405 218 close(fd);
mas01cr@405 219 }
mas01cr@405 220 if(insert->times) {
mas01cr@405 221 double t, *tp;
mas01cr@405 222 if(!(file = fopen(insert->times, "r"))) {
mas01cr@405 223 goto error;
mas01cr@405 224 }
mas01cr@405 225 datum.times = (double *) malloc(2 * size / datum.dim);
mas01cr@404 226 if(!datum.times) {
mas01cr@405 227 goto error;
mas01cr@404 228 }
mas01cr@405 229 if(fscanf(file, " %lf", &t) != 1) {
mas01cr@405 230 goto error;
mas01cr@405 231 }
mas01cr@405 232 tp = datum.times;
mas01cr@405 233 *tp++ = t;
mas01cr@405 234 for(unsigned int n = 0; n < datum.nvectors - 1; n++) {
mas01cr@405 235 if(fscanf(file, " %lf", &t) != 1) {
mas01cr@405 236 goto error;
mas01cr@405 237 }
mas01cr@405 238 *tp++ = t;
mas01cr@405 239 *tp++ = t;
mas01cr@405 240 }
mas01cr@405 241 if(fscanf(file, " %lf", &t) != 1) {
mas01cr@405 242 goto error;
mas01cr@405 243 }
mas01cr@405 244 *tp = t;
mas01cr@405 245 fclose(file);
mas01cr@404 246 }
mas01cr@405 247 datum.key = insert->key ? insert->key : insert->features;
mas01cr@405 248 err = audiodb_insert_datum(adb, &datum);
mas01cr@405 249 free(datum.data);
mas01cr@405 250 if(datum.power) {
mas01cr@405 251 free(datum.power);
mas01cr@405 252 }
mas01cr@405 253 if(datum.times) {
mas01cr@405 254 free(datum.times);
mas01cr@405 255 }
mas01cr@405 256 if(err == 2) {
mas01cr@405 257 return 0;
mas01cr@405 258 }
mas01cr@405 259 else {
mas01cr@405 260 return err;
mas01cr@405 261 }
mas01cr@405 262
mas01cr@405 263 error:
mas01cr@405 264 if(fd > 0) {
mas01cr@405 265 close(fd);
mas01cr@405 266 }
mas01cr@405 267 if(file) {
mas01cr@405 268 fclose(file);
mas01cr@405 269 }
mas01cr@405 270 if(datum.data) {
mas01cr@405 271 free(datum.data);
mas01cr@405 272 }
mas01cr@405 273 if(datum.power) {
mas01cr@405 274 free(datum.power);
mas01cr@405 275 }
mas01cr@405 276 if(datum.times) {
mas01cr@405 277 free(datum.times);
mas01cr@405 278 }
mas01cr@405 279 return 1;
mas01cr@405 280 }
mas01cr@405 281
mas01cr@405 282 int audiodb_batchinsert(adb_t *adb, adb_insert_t *insert, unsigned int size) {
mas01cr@405 283 int err;
mas01cr@405 284 for(unsigned int n = 0; n < size; n++) {
mas01cr@405 285 if((err = audiodb_insert(adb, &(insert[n])))) {
mas01cr@405 286 return err;
mas01cr@404 287 }
mas01cr@404 288 }
mas01cr@405 289 return 0;
mas01cr@239 290 }
mas01cr@239 291
mas01cr@239 292 void audioDB::insert(const char* dbName, const char* inFile) {
mas01cr@404 293 if(!adb) {
mas01cr@404 294 if(!(adb = audiodb_open(dbName, O_RDWR))) {
mas01cr@404 295 error("failed to open database", dbName);
mas01cr@404 296 }
mas01cr@251 297 }
mas01cr@404 298 if(adb->header->flags & O2_FLAG_LARGE_ADB) {
mas01cr@404 299
mas01cr@404 300 } else {
mas01cr@404 301 /* at this point, we have powerfd (an fd), timesFile (a
mas01cr@405 302 * std::ifstream *) and inFile (a char *). Wacky, huh? Ignore
mas01cr@405 303 * the wackiness and just use the names. */
mas01cr@405 304 adb_insert_t insert;
mas01cr@405 305 insert.features = inFile;
mas01cr@405 306 insert.times = timesFileName;
mas01cr@405 307 insert.power = powerFileName;
mas01cr@405 308 insert.key = key;
mas01cr@405 309 if(audiodb_insert(adb, &insert)) {
mas01cr@405 310 error("insertion failure", inFile);
mas01cr@405 311 }
mas01cr@239 312 }
mas01cr@239 313 status(dbName);
mas01cr@239 314 }
mas01cr@239 315
mas01cr@239 316 void audioDB::batchinsert(const char* dbName, const char* inFile) {
mas01cr@239 317 forWrite = true;
mas01cr@239 318 initDBHeader(dbName);
mas01cr@239 319
mas01mc@324 320 // Treat large ADB instances differently
mas01mc@324 321 if( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 322 batchinsert_large_adb(dbName, inFile) ;
mas01mc@324 323 return;
mas01mc@324 324 }
mas01mc@324 325
mas01cr@239 326 if(!key)
mas01cr@239 327 key=inFile;
mas01cr@239 328 std::ifstream *filesIn = 0;
mas01cr@239 329 std::ifstream *keysIn = 0;
mas01cr@239 330
mas01cr@239 331 if(!(filesIn = new std::ifstream(inFile)))
mas01cr@239 332 error("Could not open batch in file", inFile);
mas01cr@239 333 if(key && key!=inFile)
mas01cr@239 334 if(!(keysIn = new std::ifstream(key)))
mas01cr@239 335 error("Could not open batch key file",key);
mas01cr@239 336
mas01cr@239 337 unsigned totalVectors=0;
mas01cr@239 338 char *thisFile = new char[MAXSTR];
mas01cr@262 339 char *thisKey = 0;
mas01cr@262 340 if (key && (key != inFile)) {
mas01cr@262 341 thisKey = new char[MAXSTR];
mas01cr@262 342 }
mas01cr@239 343 char *thisTimesFileName = new char[MAXSTR];
mas01cr@239 344 char *thisPowerFileName = new char[MAXSTR];
mas01cr@302 345
mas01cr@302 346 do {
mas01cr@239 347 filesIn->getline(thisFile,MAXSTR);
mas01cr@262 348 if(key && key!=inFile) {
mas01cr@239 349 keysIn->getline(thisKey,MAXSTR);
mas01cr@262 350 } else {
mas01cr@239 351 thisKey = thisFile;
mas01cr@262 352 }
mas01cr@262 353 if(usingTimes) {
mas01cr@262 354 timesFile->getline(thisTimesFileName,MAXSTR);
mas01cr@262 355 }
mas01cr@262 356 if(usingPower) {
mas01cr@239 357 powerFile->getline(thisPowerFileName, MAXSTR);
mas01cr@262 358 }
mas01cr@239 359
mas01cr@262 360 if(filesIn->eof()) {
mas01cr@239 361 break;
mas01cr@262 362 }
mas01cr@404 363 if(usingTimes){
mas01cr@404 364 if(timesFile->eof()) {
mas01cr@404 365 error("not enough timestamp files in timesList", timesFileName);
mas01cr@404 366 }
mas01cr@251 367 }
mas01cr@404 368 if (usingPower) {
mas01cr@404 369 if(powerFile->eof()) {
mas01cr@404 370 error("not enough power files in powerList", powerFileName);
mas01cr@239 371 }
mas01cr@239 372 }
mas01cr@405 373 adb_insert_t insert;
mas01cr@405 374 insert.features = thisFile;
mas01cr@405 375 insert.times = usingTimes ? thisTimesFileName : NULL;
mas01cr@405 376 insert.power = usingPower ? thisPowerFileName : NULL;
mas01cr@405 377 insert.key = thisKey;
mas01cr@405 378 if(audiodb_insert(adb, &insert)) {
mas01cr@405 379 error("insertion failure", thisFile);
mas01cr@405 380 }
mas01cr@239 381 } while(!filesIn->eof());
mas01cr@239 382
mas01cr@239 383 VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double)));
mas01cr@262 384
mas01cr@262 385 delete [] thisPowerFileName;
mas01cr@262 386 if(key && (key != inFile)) {
mas01cr@262 387 delete [] thisKey;
mas01cr@262 388 }
mas01cr@262 389 delete [] thisFile;
mas01cr@262 390 delete [] thisTimesFileName;
mas01cr@239 391
mas01cr@262 392 delete filesIn;
mas01cr@262 393 delete keysIn;
mas01cr@262 394
mas01cr@239 395 // Report status
mas01cr@239 396 status(dbName);
mas01cr@239 397 }
mas01mc@324 398
mas01mc@324 399
mas01mc@324 400 // BATCHINSERT_LARGE_ADB
mas01mc@324 401 //
mas01mc@324 402 // This method inserts file pointers into the ADB instance rather than the actual feature data
mas01mc@324 403 //
mas01mc@324 404 // This method is intended for databases that are large enough to only support indexed query
mas01mc@324 405 // So exhaustive searching across all feature vectors will not be performed
mas01mc@324 406 //
mas01mc@324 407 // We insert featureFileName, [powerFileName], [timesFileName]
mas01mc@324 408 //
mas01mc@324 409 // l2norms and power sequence sums are calculated on-the-fly at INDEX and --lsh_exact QUERY time
mas01mc@324 410 //
mas01mc@324 411 // LIMITS:
mas01mc@324 412 //
mas01mc@324 413 // We impose an upper limit of 1M keys, 1M featureFiles, 1M powerFiles and 1M timesFiles
mas01mc@324 414 //
mas01mc@324 415 void audioDB::batchinsert_large_adb(const char* dbName, const char* inFile) {
mas01mc@324 416
mas01mc@324 417 if(!key)
mas01mc@324 418 key=inFile;
mas01mc@324 419 std::ifstream *filesIn = 0;
mas01mc@324 420 std::ifstream *keysIn = 0;
mas01mc@324 421 std::ifstream* thisTimesFile = 0;
mas01mc@324 422 int thispowerfd = 0;
mas01mc@324 423
mas01mc@324 424 if(!(filesIn = new std::ifstream(inFile)))
mas01mc@324 425 error("Could not open batch in file", inFile);
mas01mc@324 426 if(key && key!=inFile)
mas01mc@324 427 if(!(keysIn = new std::ifstream(key)))
mas01mc@324 428 error("Could not open batch key file",key);
mas01mc@324 429
mas01mc@324 430 if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
mas01mc@324 431 error("Must use timestamps with timestamped database","use --times");
mas01mc@324 432
mas01mc@324 433 if(!usingPower && (dbH->flags & O2_FLAG_POWER))
mas01mc@324 434 error("Must use power with power-enabled database", dbName);
mas01mc@324 435
mas01cr@380 436 char *cwd = new char[PATH_MAX];
mas01cr@380 437
mas01cr@380 438 if ((getcwd(cwd, PATH_MAX)) == 0) {
mas01cr@380 439 error("error getting working directory", "", "getcwd");
mas01cr@380 440 }
mas01cr@380 441
mas01mc@324 442 unsigned totalVectors=0;
mas01mc@324 443 char *thisFile = new char[MAXSTR];
mas01mc@324 444 char *thisKey = 0;
mas01mc@324 445 if (key && (key != inFile)) {
mas01mc@324 446 thisKey = new char[MAXSTR];
mas01mc@324 447 }
mas01mc@324 448 char *thisTimesFileName = new char[MAXSTR];
mas01mc@324 449 char *thisPowerFileName = new char[MAXSTR];
mas01mc@324 450
mas01mc@324 451 std::set<std::string> s;
mas01mc@324 452
mas01mc@324 453 for (unsigned k = 0; k < dbH->numFiles; k++) {
mas01mc@324 454 s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE);
mas01mc@324 455 }
mas01mc@324 456
mas01mc@324 457 do {
mas01mc@324 458 filesIn->getline(thisFile,MAXSTR);
mas01mc@324 459 if(key && key!=inFile) {
mas01mc@324 460 keysIn->getline(thisKey,MAXSTR);
mas01mc@324 461 } else {
mas01mc@324 462 thisKey = thisFile;
mas01mc@324 463 }
mas01mc@324 464 if(usingTimes) {
mas01mc@324 465 timesFile->getline(thisTimesFileName,MAXSTR);
mas01mc@324 466 }
mas01mc@324 467 if(usingPower) {
mas01mc@324 468 powerFile->getline(thisPowerFileName, MAXSTR);
mas01mc@324 469 }
mas01mc@324 470
mas01mc@324 471 if(filesIn->eof()) {
mas01mc@324 472 break;
mas01mc@324 473 }
mas01mc@324 474
mas01mc@324 475 initInputFile(thisFile, false);
mas01mc@324 476
mas01mc@324 477 if(!enough_per_file_space_free()) {
mas01mc@324 478 error("batchinsert failed: no more room for metadata", thisFile);
mas01mc@324 479 }
mas01mc@324 480
mas01mc@324 481 if(s.count(thisKey)) {
mas01mc@324 482 VERB_LOG(0, "key already exists in database: %s\n", thisKey);
mas01mc@324 483 } else {
mas01mc@324 484 s.insert(thisKey);
mas01mc@324 485 // Make a track index table of features to file indexes
mas01mc@324 486 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
mas01mc@324 487 if(!numVectors) {
mas01mc@324 488 VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey);
mas01mc@324 489 }
mas01mc@324 490 else{
mas01mc@324 491 // Check that time-stamp file exists
mas01mc@324 492 if(usingTimes){
mas01mc@324 493 if(timesFile->eof()) {
mas01mc@324 494 error("not enough timestamp files in timesList", timesFileName);
mas01mc@324 495 }
mas01mc@324 496 thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in);
mas01mc@324 497 if(!thisTimesFile->is_open()) {
mas01mc@324 498 error("Cannot open timestamp file", thisTimesFileName);
mas01mc@324 499 }
mas01mc@324 500 if(thisTimesFile)
mas01mc@324 501 delete thisTimesFile;
mas01mc@324 502 }
mas01mc@324 503
mas01mc@324 504 // Check that power file exists
mas01mc@324 505 if (usingPower) {
mas01mc@324 506 if(powerFile->eof()) {
mas01mc@324 507 error("not enough power files in powerList", powerFileName);
mas01mc@324 508 }
mas01mc@324 509 thispowerfd = open(thisPowerFileName, O_RDONLY);
mas01mc@324 510 if (thispowerfd < 0) {
mas01mc@324 511 error("failed to open power file", thisPowerFileName);
mas01mc@324 512 }
mas01mc@324 513 if (0 < thispowerfd) {
mas01mc@324 514 close(thispowerfd);
mas01mc@324 515 }
mas01mc@324 516 }
mas01mc@324 517
mas01mc@324 518 // persist links to the feature files for reading from filesystem later
mas01mc@324 519
mas01mc@324 520 // Primary Keys
mas01mc@324 521 INSERT_FILETABLE_STRING(fileTable, thisKey);
mas01cr@380 522
mas01cr@380 523 if(*thisFile != '/') {
mas01cr@380 524 /* FIXME: MAXSTR and O2_FILETABLE_ENTRY_SIZE should probably
mas01cr@380 525 be the same thing. Also, both are related to PATH_MAX,
mas01cr@380 526 which admittedly is not always defined or a
mas01cr@380 527 constant... */
mas01cr@380 528 char tmp[MAXSTR];
mas01cr@380 529 strncpy(tmp, thisFile, MAXSTR);
mas01cr@380 530 snprintf(thisFile, MAXSTR, "%s/%s", cwd, tmp);
mas01cr@380 531 }
mas01mc@324 532 // Feature Vector fileNames
mas01mc@324 533 INSERT_FILETABLE_STRING(featureFileNameTable, thisFile);
mas01mc@324 534
mas01mc@324 535 // Time Stamp fileNames
mas01cr@380 536 if(usingTimes) {
mas01cr@380 537 if(*thisTimesFileName != '/') {
mas01cr@380 538 char tmp[MAXSTR];
mas01cr@380 539 strncpy(tmp, thisTimesFileName, MAXSTR);
mas01cr@380 540 snprintf(thisTimesFileName, MAXSTR, "%s/%s", cwd, tmp);
mas01cr@380 541 }
mas01mc@324 542 INSERT_FILETABLE_STRING(timesFileNameTable, thisTimesFileName);
mas01cr@380 543 }
mas01mc@324 544
mas01mc@324 545 // Power fileNames
mas01cr@380 546 if(usingPower) {
mas01cr@380 547 if(*thisPowerFileName != '/') {
mas01cr@380 548 char tmp[MAXSTR];
mas01cr@380 549 strncpy(tmp, thisPowerFileName, MAXSTR);
mas01cr@380 550 snprintf(thisPowerFileName, MAXSTR, "%s/%s", cwd, tmp);
mas01cr@380 551 }
mas01mc@324 552 INSERT_FILETABLE_STRING(powerFileNameTable, thisPowerFileName);
mas01cr@380 553 }
mas01mc@324 554
mas01mc@324 555 // Increment file count
mas01mc@324 556 dbH->numFiles++;
mas01mc@324 557
mas01mc@324 558 // Update Header information
mas01mc@324 559 dbH->length+=(statbuf.st_size-sizeof(int));
mas01mc@324 560
mas01mc@324 561 // Update track to file index map
mas01mc@324 562 memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned));
mas01mc@324 563
mas01mc@324 564 totalVectors+=numVectors;
mas01mc@324 565
mas01mc@324 566 // Copy the header back to the database
mas01mc@324 567 memcpy (db, dbH, sizeof(dbTableHeaderT));
mas01mc@324 568 }
mas01mc@324 569 }
mas01mc@324 570 // CLEAN UP
mas01mc@324 571 if(indata)
mas01mc@324 572 munmap(indata,statbuf.st_size);
mas01mc@324 573 if(infid>0)
mas01mc@324 574 close(infid);
mas01mc@324 575 } while(!filesIn->eof());
mas01mc@324 576
mas01mc@324 577 VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double)));
mas01mc@324 578
mas01mc@324 579 delete [] thisPowerFileName;
mas01mc@324 580 if(key && (key != inFile)) {
mas01mc@324 581 delete [] thisKey;
mas01mc@324 582 }
mas01mc@324 583 delete [] thisFile;
mas01mc@324 584 delete [] thisTimesFileName;
mas01mc@324 585
mas01mc@324 586 delete filesIn;
mas01mc@324 587 delete keysIn;
mas01mc@324 588
mas01mc@324 589 // Report status
mas01mc@324 590 status(dbName);
mas01mc@324 591 }