annotate insert.cpp @ 251:a6c9a1c68646

Detect when we've run out of per-file space on insert() and batchinsert(). Exposed by Lute Music/frames1 dataset, we previously corrupted the trackTable and then got a segfault. This happened because the fileTable and trackTable were mmap()ed next to each other, by coincidence, and the lack of overflow checking on the fileTable meant that continued insertion scribbled over the trackTable, which was twice as big (because it has to be at least one memory page in size). The root cause of all this is the --size creation argument, which needs to be split into --nfiles, --datasize and --dimensions, so that the size of all the tables can be computed accurately. No test case yet, because my /bin/sh is currently pointing to dash, which gets about as far as line 6 of run-tests.sh before giving up. (We need either to fix bashisms or to run /bin/bash explicitly.)
author mas01cr
date Mon, 31 Mar 2008 11:52:59 +0000
parents 2cc06e5b05a5
children 4dcb09f5fe85
rev   line source
mas01cr@239 1 #include "audioDB.h"
mas01cr@239 2
mas01cr@251 3 bool audioDB::enough_per_file_space_free() {
mas01cr@251 4 unsigned int fmaxfiles, tmaxfiles;
mas01cr@251 5 unsigned int maxfiles;
mas01cr@251 6
mas01cr@251 7 fmaxfiles = fileTableLength / O2_FILETABLESIZE;
mas01cr@251 8 tmaxfiles = trackTableLength / O2_TRACKTABLESIZE;
mas01cr@251 9 maxfiles = fmaxfiles > tmaxfiles ? tmaxfiles : fmaxfiles;
mas01cr@251 10 return(dbH->numFiles < maxfiles);
mas01cr@251 11 }
mas01cr@251 12
mas01cr@239 13 bool audioDB::enough_data_space_free(off_t size) {
mas01cr@239 14 return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size);
mas01cr@239 15 }
mas01cr@239 16
mas01cr@239 17 void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) {
mas01cr@239 18 lseek(dbfid, dbH->dataOffset + offset, SEEK_SET);
mas01cr@239 19 write(dbfid, buffer, size);
mas01cr@239 20 }
mas01cr@239 21
mas01cr@239 22 void audioDB::insert(const char* dbName, const char* inFile) {
mas01cr@239 23 forWrite = true;
mas01cr@239 24 initTables(dbName, inFile);
mas01cr@239 25
mas01cr@239 26 if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
mas01cr@239 27 error("Must use timestamps with timestamped database","use --times");
mas01cr@239 28
mas01cr@239 29 if(!usingPower && (dbH->flags & O2_FLAG_POWER))
mas01cr@239 30 error("Must use power with power-enabled database", dbName);
mas01cr@239 31
mas01cr@251 32 if(!enough_per_file_space_free()) {
mas01cr@251 33 error("Insert failed: no more room for metadata", inFile);
mas01cr@251 34 }
mas01cr@251 35
mas01cr@239 36 if(!enough_data_space_free(statbuf.st_size - sizeof(int))) {
mas01cr@239 37 error("Insert failed: no more room in database", inFile);
mas01cr@239 38 }
mas01cr@239 39
mas01cr@239 40 if(!key)
mas01cr@239 41 key=inFile;
mas01cr@239 42 // Linear scan of filenames check for pre-existing feature
mas01cr@239 43 unsigned alreadyInserted=0;
mas01cr@239 44 for(unsigned k=0; k<dbH->numFiles; k++)
mas01cr@239 45 if(strncmp(fileTable + k*O2_FILETABLESIZE, key, strlen(key)+1)==0){
mas01cr@239 46 alreadyInserted=1;
mas01cr@239 47 break;
mas01cr@239 48 }
mas01cr@239 49
mas01cr@239 50 if(alreadyInserted) {
mas01cr@239 51 VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile);
mas01cr@239 52 return;
mas01cr@239 53 }
mas01cr@239 54
mas01cr@239 55 // Make a track index table of features to file indexes
mas01cr@239 56 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
mas01cr@239 57 if(!numVectors) {
mas01cr@239 58 VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", key);
mas01cr@239 59
mas01cr@239 60 // CLEAN UP
mas01cr@239 61 munmap(indata,statbuf.st_size);
mas01cr@239 62 munmap(db,dbH->dbSize);
mas01cr@239 63 close(infid);
mas01cr@239 64 return;
mas01cr@239 65 }
mas01cr@239 66
mas01cr@239 67 strncpy(fileTable + dbH->numFiles*O2_FILETABLESIZE, key, strlen(key));
mas01cr@239 68
mas01cr@239 69 off_t insertoffset = dbH->length;// Store current state
mas01cr@239 70
mas01cr@239 71 // Check times status and insert times from file
mas01cr@239 72 unsigned indexoffset = insertoffset/(dbH->dim*sizeof(double));
mas01cr@239 73 double *timesdata = timesTable + 2*indexoffset;
mas01cr@239 74
mas01cr@239 75 if(2*(indexoffset + numVectors) > timesTableLength) {
mas01cr@239 76 error("out of space for times", key);
mas01cr@239 77 }
mas01cr@239 78
mas01cr@239 79 if (usingTimes) {
mas01cr@239 80 insertTimeStamps(numVectors, timesFile, timesdata);
mas01cr@239 81 }
mas01cr@239 82
mas01cr@239 83 double *powerdata = powerTable + indexoffset;
mas01cr@239 84 insertPowerData(numVectors, powerfd, powerdata);
mas01cr@239 85
mas01cr@239 86 // Increment file count
mas01cr@239 87 dbH->numFiles++;
mas01cr@239 88
mas01cr@239 89 // Update Header information
mas01cr@239 90 dbH->length+=(statbuf.st_size-sizeof(int));
mas01cr@239 91
mas01cr@239 92 // Update track to file index map
mas01cr@239 93 memcpy(trackTable + dbH->numFiles - 1, &numVectors, sizeof(unsigned));
mas01cr@239 94
mas01cr@239 95 insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int));
mas01cr@239 96
mas01cr@239 97 // Norm the vectors on input if the database is already L2 normed
mas01cr@239 98 if(dbH->flags & O2_FLAG_L2NORM)
mas01cr@239 99 unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append
mas01cr@239 100
mas01cr@239 101 // Report status
mas01cr@239 102 status(dbName);
mas01cr@239 103 VERB_LOG(0, "%s %s %u vectors %jd bytes.\n", COM_INSERT, dbName, numVectors, (intmax_t) (statbuf.st_size - sizeof(int)));
mas01cr@239 104
mas01cr@239 105 // Copy the header back to the database
mas01cr@239 106 memcpy (db, dbH, sizeof(dbTableHeaderT));
mas01cr@239 107
mas01cr@239 108 // CLEAN UP
mas01cr@239 109 munmap(indata,statbuf.st_size);
mas01cr@239 110 close(infid);
mas01cr@239 111 }
mas01cr@239 112
mas01cr@239 113 void audioDB::insertTimeStamps(unsigned numVectors, std::ifstream *timesFile, double *timesdata) {
mas01cr@239 114 assert(usingTimes);
mas01cr@239 115
mas01cr@239 116 unsigned numtimes = 0;
mas01cr@239 117
mas01cr@239 118 if(!(dbH->flags & O2_FLAG_TIMES) && !dbH->numFiles) {
mas01cr@239 119 dbH->flags=dbH->flags|O2_FLAG_TIMES;
mas01cr@239 120 } else if(!(dbH->flags & O2_FLAG_TIMES)) {
mas01cr@239 121 error("Timestamp file used with non-timestamped database", timesFileName);
mas01cr@239 122 }
mas01cr@239 123
mas01cr@239 124 if(!timesFile->is_open()) {
mas01cr@239 125 error("problem opening times file on timestamped database", timesFileName);
mas01cr@239 126 }
mas01cr@239 127
mas01cr@239 128 double timepoint, next;
mas01cr@239 129 *timesFile >> timepoint;
mas01cr@239 130 if (timesFile->eof()) {
mas01cr@239 131 error("no entries in times file", timesFileName);
mas01cr@239 132 }
mas01cr@239 133 numtimes++;
mas01cr@239 134 do {
mas01cr@239 135 *timesFile >> next;
mas01cr@239 136 if (timesFile->eof()) {
mas01cr@239 137 break;
mas01cr@239 138 }
mas01cr@239 139 numtimes++;
mas01cr@239 140 timesdata[0] = timepoint;
mas01cr@239 141 timepoint = (timesdata[1] = next);
mas01cr@239 142 timesdata += 2;
mas01cr@239 143 } while (numtimes < numVectors + 1);
mas01cr@239 144
mas01cr@239 145 if (numtimes < numVectors + 1) {
mas01cr@239 146 error("too few timepoints in times file", timesFileName);
mas01cr@239 147 }
mas01cr@239 148
mas01cr@239 149 *timesFile >> next;
mas01cr@239 150 if (!timesFile->eof()) {
mas01cr@239 151 error("too many timepoints in times file", timesFileName);
mas01cr@239 152 }
mas01cr@239 153 }
mas01cr@239 154
mas01cr@239 155 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
mas01cr@239 156 if (usingPower) {
mas01cr@239 157 if (!(dbH->flags & O2_FLAG_POWER)) {
mas01cr@239 158 error("Cannot insert power data on non-power DB", dbName);
mas01cr@239 159 }
mas01cr@239 160
mas01cr@239 161 int one;
mas01cr@239 162 unsigned int count;
mas01cr@239 163
mas01cr@239 164 count = read(powerfd, &one, sizeof(unsigned int));
mas01cr@239 165 if (count != sizeof(unsigned int)) {
mas01cr@239 166 error("powerfd read failed", "int", "read");
mas01cr@239 167 }
mas01cr@239 168 if (one != 1) {
mas01cr@239 169 error("dimensionality of power file not 1", powerFileName);
mas01cr@239 170 }
mas01cr@239 171
mas01cr@239 172 // FIXME: should check that the powerfile is the right size for
mas01cr@239 173 // this. -- CSR, 2007-10-30
mas01cr@239 174 count = read(powerfd, powerdata, numVectors * sizeof(double));
mas01cr@239 175 if (count != numVectors * sizeof(double)) {
mas01cr@239 176 error("powerfd read failed", "double", "read");
mas01cr@239 177 }
mas01cr@239 178 }
mas01cr@239 179 }
mas01cr@239 180
mas01cr@239 181 void audioDB::batchinsert(const char* dbName, const char* inFile) {
mas01cr@239 182
mas01cr@239 183 forWrite = true;
mas01cr@239 184 initDBHeader(dbName);
mas01cr@239 185
mas01cr@239 186 if(!key)
mas01cr@239 187 key=inFile;
mas01cr@239 188 std::ifstream *filesIn = 0;
mas01cr@239 189 std::ifstream *keysIn = 0;
mas01cr@239 190 std::ifstream* thisTimesFile = 0;
mas01cr@239 191 int thispowerfd = 0;
mas01cr@239 192
mas01cr@239 193 if(!(filesIn = new std::ifstream(inFile)))
mas01cr@239 194 error("Could not open batch in file", inFile);
mas01cr@239 195 if(key && key!=inFile)
mas01cr@239 196 if(!(keysIn = new std::ifstream(key)))
mas01cr@239 197 error("Could not open batch key file",key);
mas01cr@239 198
mas01cr@239 199 if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
mas01cr@239 200 error("Must use timestamps with timestamped database","use --times");
mas01cr@239 201
mas01cr@239 202 if(!usingPower && (dbH->flags & O2_FLAG_POWER))
mas01cr@239 203 error("Must use power with power-enabled database", dbName);
mas01cr@239 204
mas01cr@239 205 unsigned totalVectors=0;
mas01cr@239 206 char *thisKey = new char[MAXSTR];
mas01cr@239 207 char *thisFile = new char[MAXSTR];
mas01cr@239 208 char *thisTimesFileName = new char[MAXSTR];
mas01cr@239 209 char *thisPowerFileName = new char[MAXSTR];
mas01cr@239 210
mas01cr@239 211 do{
mas01cr@239 212 filesIn->getline(thisFile,MAXSTR);
mas01cr@239 213 if(key && key!=inFile)
mas01cr@239 214 keysIn->getline(thisKey,MAXSTR);
mas01cr@239 215 else
mas01cr@239 216 thisKey = thisFile;
mas01cr@239 217 if(usingTimes)
mas01cr@239 218 timesFile->getline(thisTimesFileName,MAXSTR);
mas01cr@239 219 if(usingPower)
mas01cr@239 220 powerFile->getline(thisPowerFileName, MAXSTR);
mas01cr@239 221
mas01cr@239 222 if(filesIn->eof())
mas01cr@239 223 break;
mas01cr@239 224
mas01cr@239 225 initInputFile(thisFile);
mas01cr@239 226
mas01cr@251 227 if(!enough_per_file_space_free()) {
mas01cr@251 228 error("batchinsert failed: no more room for metadata", thisFile);
mas01cr@251 229 }
mas01cr@251 230
mas01cr@239 231 if(!enough_data_space_free(statbuf.st_size - sizeof(int))) {
mas01cr@239 232 error("batchinsert failed: no more room in database", thisFile);
mas01cr@239 233 }
mas01cr@239 234
mas01cr@239 235 // Linear scan of filenames check for pre-existing feature
mas01cr@239 236 unsigned alreadyInserted=0;
mas01cr@239 237
mas01cr@239 238 for(unsigned k=0; k<dbH->numFiles; k++)
mas01cr@239 239 if(strncmp(fileTable + k*O2_FILETABLESIZE, thisKey, strlen(thisKey)+1)==0){
mas01cr@239 240 alreadyInserted=1;
mas01cr@239 241 break;
mas01cr@239 242 }
mas01cr@239 243
mas01cr@239 244 if(alreadyInserted) {
mas01cr@239 245 VERB_LOG(0, "key already exists in database: %s\n", thisKey);
mas01cr@239 246 } else {
mas01cr@239 247 // Make a track index table of features to file indexes
mas01cr@239 248 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
mas01cr@239 249 if(!numVectors) {
mas01cr@239 250 VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey);
mas01cr@239 251 }
mas01cr@239 252 else{
mas01cr@239 253 if(usingTimes){
mas01cr@239 254 if(timesFile->eof()) {
mas01cr@239 255 error("not enough timestamp files in timesList", timesFileName);
mas01cr@239 256 }
mas01cr@239 257 thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in);
mas01cr@239 258 if(!thisTimesFile->is_open()) {
mas01cr@239 259 error("Cannot open timestamp file", thisTimesFileName);
mas01cr@239 260 }
mas01cr@239 261 off_t insertoffset = dbH->length;
mas01cr@239 262 unsigned indexoffset = insertoffset / (dbH->dim*sizeof(double));
mas01cr@239 263 double *timesdata = timesTable + 2*indexoffset;
mas01cr@239 264 if(2*(indexoffset + numVectors) > timesTableLength) {
mas01cr@239 265 error("out of space for times", key);
mas01cr@239 266 }
mas01cr@239 267 insertTimeStamps(numVectors, thisTimesFile, timesdata);
mas01cr@239 268 if(thisTimesFile)
mas01cr@239 269 delete thisTimesFile;
mas01cr@239 270 }
mas01cr@239 271
mas01cr@239 272 if (usingPower) {
mas01cr@239 273 if(powerFile->eof()) {
mas01cr@239 274 error("not enough power files in powerList", powerFileName);
mas01cr@239 275 }
mas01cr@239 276 thispowerfd = open(thisPowerFileName, O_RDONLY);
mas01cr@239 277 if (thispowerfd < 0) {
mas01cr@239 278 error("failed to open power file", thisPowerFileName);
mas01cr@239 279 }
mas01cr@239 280 off_t insertoffset = dbH->length;
mas01cr@239 281 unsigned poweroffset = insertoffset / (dbH->dim * sizeof(double));
mas01cr@239 282 double *powerdata = powerTable + poweroffset;
mas01cr@239 283 insertPowerData(numVectors, thispowerfd, powerdata);
mas01cr@239 284 if (0 < thispowerfd) {
mas01cr@239 285 close(thispowerfd);
mas01cr@239 286 }
mas01cr@239 287 }
mas01cr@239 288 strncpy(fileTable + dbH->numFiles*O2_FILETABLESIZE, thisKey, strlen(thisKey));
mas01cr@239 289
mas01cr@239 290 off_t insertoffset = dbH->length;// Store current state
mas01cr@239 291
mas01cr@239 292 // Increment file count
mas01cr@239 293 dbH->numFiles++;
mas01cr@239 294
mas01cr@239 295 // Update Header information
mas01cr@239 296 dbH->length+=(statbuf.st_size-sizeof(int));
mas01cr@239 297
mas01cr@239 298 // Update track to file index map
mas01cr@239 299 memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned));
mas01cr@239 300
mas01cr@239 301 insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int));
mas01cr@239 302
mas01cr@239 303 // Norm the vectors on input if the database is already L2 normed
mas01cr@239 304 if(dbH->flags & O2_FLAG_L2NORM)
mas01cr@239 305 unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append
mas01cr@239 306
mas01cr@239 307 totalVectors+=numVectors;
mas01cr@239 308
mas01cr@239 309 // Copy the header back to the database
mas01cr@239 310 memcpy (db, dbH, sizeof(dbTableHeaderT));
mas01cr@239 311 }
mas01cr@239 312 }
mas01cr@239 313 // CLEAN UP
mas01cr@239 314 munmap(indata,statbuf.st_size);
mas01cr@239 315 close(infid);
mas01cr@239 316 } while(!filesIn->eof());
mas01cr@239 317
mas01cr@239 318 VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double)));
mas01cr@239 319
mas01cr@239 320 // Report status
mas01cr@239 321 status(dbName);
mas01cr@239 322 }