annotate insert.cpp @ 279:dee55886eca0 sampling

make the RNG a part of the audioDB object. Easier to deal with memory discipline and initialization (though note the FIXME comment in audioDB::initTables()). Also initialize the RNG from the current time. A mature implementation would use a proper source of entropy...
author mas01cr
date Wed, 02 Jul 2008 13:53:23 +0000
parents 34ce7f7a177d
children 74824093c1c4
rev   line source
mas01cr@239 1 #include "audioDB.h"
mas01cr@239 2
mas01cr@251 3 bool audioDB::enough_per_file_space_free() {
mas01cr@251 4 unsigned int fmaxfiles, tmaxfiles;
mas01cr@251 5 unsigned int maxfiles;
mas01cr@251 6
mas01cr@256 7 fmaxfiles = fileTableLength / O2_FILETABLE_ENTRY_SIZE;
mas01cr@256 8 tmaxfiles = trackTableLength / O2_TRACKTABLE_ENTRY_SIZE;
mas01cr@251 9 maxfiles = fmaxfiles > tmaxfiles ? tmaxfiles : fmaxfiles;
mas01cr@251 10 return(dbH->numFiles < maxfiles);
mas01cr@251 11 }
mas01cr@251 12
mas01cr@239 13 bool audioDB::enough_data_space_free(off_t size) {
mas01cr@239 14 return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size);
mas01cr@239 15 }
mas01cr@239 16
mas01cr@239 17 void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) {
mas01cr@239 18 lseek(dbfid, dbH->dataOffset + offset, SEEK_SET);
mas01cr@239 19 write(dbfid, buffer, size);
mas01cr@239 20 }
mas01cr@239 21
mas01cr@239 22 void audioDB::insert(const char* dbName, const char* inFile) {
mas01cr@239 23 forWrite = true;
mas01cr@239 24 initTables(dbName, inFile);
mas01cr@239 25
mas01cr@239 26 if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
mas01cr@239 27 error("Must use timestamps with timestamped database","use --times");
mas01cr@239 28
mas01cr@239 29 if(!usingPower && (dbH->flags & O2_FLAG_POWER))
mas01cr@239 30 error("Must use power with power-enabled database", dbName);
mas01cr@239 31
mas01cr@251 32 if(!enough_per_file_space_free()) {
mas01cr@251 33 error("Insert failed: no more room for metadata", inFile);
mas01cr@251 34 }
mas01cr@251 35
mas01cr@239 36 if(!enough_data_space_free(statbuf.st_size - sizeof(int))) {
mas01cr@239 37 error("Insert failed: no more room in database", inFile);
mas01cr@239 38 }
mas01cr@239 39
mas01cr@239 40 if(!key)
mas01cr@239 41 key=inFile;
mas01cr@239 42 // Linear scan of filenames check for pre-existing feature
mas01cr@239 43 unsigned alreadyInserted=0;
mas01cr@239 44 for(unsigned k=0; k<dbH->numFiles; k++)
mas01cr@256 45 if(strncmp(fileTable + k*O2_FILETABLE_ENTRY_SIZE, key, strlen(key)+1)==0){
mas01cr@239 46 alreadyInserted=1;
mas01cr@239 47 break;
mas01cr@239 48 }
mas01cr@239 49
mas01cr@239 50 if(alreadyInserted) {
mas01cr@239 51 VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile);
mas01cr@239 52 return;
mas01cr@239 53 }
mas01cr@239 54
mas01cr@239 55 // Make a track index table of features to file indexes
mas01cr@239 56 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
mas01cr@239 57 if(!numVectors) {
mas01cr@239 58 VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", key);
mas01cr@239 59
mas01cr@239 60 // CLEAN UP
mas01cr@239 61 munmap(indata,statbuf.st_size);
mas01cr@239 62 munmap(db,dbH->dbSize);
mas01cr@239 63 close(infid);
mas01cr@239 64 return;
mas01cr@239 65 }
mas01cr@239 66
mas01cr@256 67 strncpy(fileTable + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE, key, strlen(key));
mas01cr@239 68
mas01cr@239 69 off_t insertoffset = dbH->length;// Store current state
mas01cr@239 70
mas01cr@239 71 // Check times status and insert times from file
mas01cr@239 72 unsigned indexoffset = insertoffset/(dbH->dim*sizeof(double));
mas01cr@239 73 double *timesdata = timesTable + 2*indexoffset;
mas01cr@239 74
mas01cr@239 75 if(2*(indexoffset + numVectors) > timesTableLength) {
mas01cr@239 76 error("out of space for times", key);
mas01cr@239 77 }
mas01cr@239 78
mas01cr@239 79 if (usingTimes) {
mas01cr@239 80 insertTimeStamps(numVectors, timesFile, timesdata);
mas01cr@239 81 }
mas01cr@239 82
mas01cr@239 83 double *powerdata = powerTable + indexoffset;
mas01cr@239 84 insertPowerData(numVectors, powerfd, powerdata);
mas01cr@239 85
mas01cr@239 86 // Increment file count
mas01cr@239 87 dbH->numFiles++;
mas01cr@239 88
mas01cr@239 89 // Update Header information
mas01cr@239 90 dbH->length+=(statbuf.st_size-sizeof(int));
mas01cr@239 91
mas01cr@239 92 // Update track to file index map
mas01cr@239 93 memcpy(trackTable + dbH->numFiles - 1, &numVectors, sizeof(unsigned));
mas01cr@239 94
mas01cr@239 95 insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int));
mas01cr@239 96
mas01cr@239 97 // Norm the vectors on input if the database is already L2 normed
mas01cr@239 98 if(dbH->flags & O2_FLAG_L2NORM)
mas01cr@239 99 unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append
mas01cr@239 100
mas01cr@239 101 // Report status
mas01cr@239 102 status(dbName);
mas01cr@239 103 VERB_LOG(0, "%s %s %u vectors %jd bytes.\n", COM_INSERT, dbName, numVectors, (intmax_t) (statbuf.st_size - sizeof(int)));
mas01cr@239 104
mas01cr@239 105 // Copy the header back to the database
mas01cr@239 106 memcpy (db, dbH, sizeof(dbTableHeaderT));
mas01cr@239 107
mas01cr@239 108 // CLEAN UP
mas01cr@239 109 munmap(indata,statbuf.st_size);
mas01cr@239 110 close(infid);
mas01cr@239 111 }
mas01cr@239 112
mas01cr@239 113 void audioDB::insertTimeStamps(unsigned numVectors, std::ifstream *timesFile, double *timesdata) {
mas01cr@239 114 assert(usingTimes);
mas01cr@239 115
mas01cr@239 116 unsigned numtimes = 0;
mas01cr@239 117
mas01cr@239 118 if(!(dbH->flags & O2_FLAG_TIMES) && !dbH->numFiles) {
mas01cr@239 119 dbH->flags=dbH->flags|O2_FLAG_TIMES;
mas01cr@239 120 } else if(!(dbH->flags & O2_FLAG_TIMES)) {
mas01cr@239 121 error("Timestamp file used with non-timestamped database", timesFileName);
mas01cr@239 122 }
mas01cr@239 123
mas01cr@239 124 if(!timesFile->is_open()) {
mas01cr@239 125 error("problem opening times file on timestamped database", timesFileName);
mas01cr@239 126 }
mas01cr@239 127
mas01cr@239 128 double timepoint, next;
mas01cr@239 129 *timesFile >> timepoint;
mas01cr@239 130 if (timesFile->eof()) {
mas01cr@239 131 error("no entries in times file", timesFileName);
mas01cr@239 132 }
mas01cr@239 133 numtimes++;
mas01cr@239 134 do {
mas01cr@239 135 *timesFile >> next;
mas01cr@239 136 if (timesFile->eof()) {
mas01cr@239 137 break;
mas01cr@239 138 }
mas01cr@239 139 numtimes++;
mas01cr@239 140 timesdata[0] = timepoint;
mas01cr@239 141 timepoint = (timesdata[1] = next);
mas01cr@239 142 timesdata += 2;
mas01cr@239 143 } while (numtimes < numVectors + 1);
mas01cr@239 144
mas01cr@239 145 if (numtimes < numVectors + 1) {
mas01cr@239 146 error("too few timepoints in times file", timesFileName);
mas01cr@239 147 }
mas01cr@239 148
mas01cr@239 149 *timesFile >> next;
mas01cr@239 150 if (!timesFile->eof()) {
mas01cr@239 151 error("too many timepoints in times file", timesFileName);
mas01cr@239 152 }
mas01cr@239 153 }
mas01cr@239 154
mas01cr@239 155 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
mas01cr@239 156 if (usingPower) {
mas01cr@239 157 if (!(dbH->flags & O2_FLAG_POWER)) {
mas01cr@239 158 error("Cannot insert power data on non-power DB", dbName);
mas01cr@239 159 }
mas01cr@239 160
mas01cr@239 161 int one;
mas01cr@239 162 unsigned int count;
mas01cr@239 163
mas01cr@239 164 count = read(powerfd, &one, sizeof(unsigned int));
mas01cr@239 165 if (count != sizeof(unsigned int)) {
mas01cr@239 166 error("powerfd read failed", "int", "read");
mas01cr@239 167 }
mas01cr@239 168 if (one != 1) {
mas01cr@239 169 error("dimensionality of power file not 1", powerFileName);
mas01cr@239 170 }
mas01cr@239 171
mas01cr@239 172 // FIXME: should check that the powerfile is the right size for
mas01cr@239 173 // this. -- CSR, 2007-10-30
mas01cr@239 174 count = read(powerfd, powerdata, numVectors * sizeof(double));
mas01cr@239 175 if (count != numVectors * sizeof(double)) {
mas01cr@239 176 error("powerfd read failed", "double", "read");
mas01cr@239 177 }
mas01cr@239 178 }
mas01cr@239 179 }
mas01cr@239 180
mas01cr@239 181 void audioDB::batchinsert(const char* dbName, const char* inFile) {
mas01cr@239 182
mas01cr@239 183 forWrite = true;
mas01cr@239 184 initDBHeader(dbName);
mas01cr@239 185
mas01cr@239 186 if(!key)
mas01cr@239 187 key=inFile;
mas01cr@239 188 std::ifstream *filesIn = 0;
mas01cr@239 189 std::ifstream *keysIn = 0;
mas01cr@239 190 std::ifstream* thisTimesFile = 0;
mas01cr@239 191 int thispowerfd = 0;
mas01cr@239 192
mas01cr@239 193 if(!(filesIn = new std::ifstream(inFile)))
mas01cr@239 194 error("Could not open batch in file", inFile);
mas01cr@239 195 if(key && key!=inFile)
mas01cr@239 196 if(!(keysIn = new std::ifstream(key)))
mas01cr@239 197 error("Could not open batch key file",key);
mas01cr@239 198
mas01cr@239 199 if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
mas01cr@239 200 error("Must use timestamps with timestamped database","use --times");
mas01cr@239 201
mas01cr@239 202 if(!usingPower && (dbH->flags & O2_FLAG_POWER))
mas01cr@239 203 error("Must use power with power-enabled database", dbName);
mas01cr@239 204
mas01cr@239 205 unsigned totalVectors=0;
mas01cr@239 206 char *thisFile = new char[MAXSTR];
mas01cr@262 207 char *thisKey = 0;
mas01cr@262 208 if (key && (key != inFile)) {
mas01cr@262 209 thisKey = new char[MAXSTR];
mas01cr@262 210 }
mas01cr@239 211 char *thisTimesFileName = new char[MAXSTR];
mas01cr@239 212 char *thisPowerFileName = new char[MAXSTR];
mas01cr@239 213
mas01cr@239 214 do{
mas01cr@239 215 filesIn->getline(thisFile,MAXSTR);
mas01cr@262 216 if(key && key!=inFile) {
mas01cr@239 217 keysIn->getline(thisKey,MAXSTR);
mas01cr@262 218 } else {
mas01cr@239 219 thisKey = thisFile;
mas01cr@262 220 }
mas01cr@262 221 if(usingTimes) {
mas01cr@262 222 timesFile->getline(thisTimesFileName,MAXSTR);
mas01cr@262 223 }
mas01cr@262 224 if(usingPower) {
mas01cr@239 225 powerFile->getline(thisPowerFileName, MAXSTR);
mas01cr@262 226 }
mas01cr@239 227
mas01cr@262 228 if(filesIn->eof()) {
mas01cr@239 229 break;
mas01cr@262 230 }
mas01cr@239 231 initInputFile(thisFile);
mas01cr@239 232
mas01cr@251 233 if(!enough_per_file_space_free()) {
mas01cr@251 234 error("batchinsert failed: no more room for metadata", thisFile);
mas01cr@251 235 }
mas01cr@251 236
mas01cr@239 237 if(!enough_data_space_free(statbuf.st_size - sizeof(int))) {
mas01cr@239 238 error("batchinsert failed: no more room in database", thisFile);
mas01cr@239 239 }
mas01cr@239 240
mas01cr@239 241 // Linear scan of filenames check for pre-existing feature
mas01cr@239 242 unsigned alreadyInserted=0;
mas01cr@239 243
mas01cr@239 244 for(unsigned k=0; k<dbH->numFiles; k++)
mas01cr@256 245 if(strncmp(fileTable + k*O2_FILETABLE_ENTRY_SIZE, thisKey, strlen(thisKey)+1)==0){
mas01cr@239 246 alreadyInserted=1;
mas01cr@239 247 break;
mas01cr@239 248 }
mas01cr@239 249
mas01cr@239 250 if(alreadyInserted) {
mas01cr@239 251 VERB_LOG(0, "key already exists in database: %s\n", thisKey);
mas01cr@239 252 } else {
mas01cr@239 253 // Make a track index table of features to file indexes
mas01cr@239 254 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
mas01cr@239 255 if(!numVectors) {
mas01cr@239 256 VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey);
mas01cr@239 257 }
mas01cr@239 258 else{
mas01cr@239 259 if(usingTimes){
mas01cr@239 260 if(timesFile->eof()) {
mas01cr@239 261 error("not enough timestamp files in timesList", timesFileName);
mas01cr@239 262 }
mas01cr@239 263 thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in);
mas01cr@239 264 if(!thisTimesFile->is_open()) {
mas01cr@239 265 error("Cannot open timestamp file", thisTimesFileName);
mas01cr@239 266 }
mas01cr@239 267 off_t insertoffset = dbH->length;
mas01cr@239 268 unsigned indexoffset = insertoffset / (dbH->dim*sizeof(double));
mas01cr@239 269 double *timesdata = timesTable + 2*indexoffset;
mas01cr@239 270 if(2*(indexoffset + numVectors) > timesTableLength) {
mas01cr@239 271 error("out of space for times", key);
mas01cr@239 272 }
mas01cr@239 273 insertTimeStamps(numVectors, thisTimesFile, timesdata);
mas01cr@239 274 if(thisTimesFile)
mas01cr@239 275 delete thisTimesFile;
mas01cr@239 276 }
mas01cr@239 277
mas01cr@239 278 if (usingPower) {
mas01cr@239 279 if(powerFile->eof()) {
mas01cr@239 280 error("not enough power files in powerList", powerFileName);
mas01cr@239 281 }
mas01cr@239 282 thispowerfd = open(thisPowerFileName, O_RDONLY);
mas01cr@239 283 if (thispowerfd < 0) {
mas01cr@239 284 error("failed to open power file", thisPowerFileName);
mas01cr@239 285 }
mas01cr@239 286 off_t insertoffset = dbH->length;
mas01cr@239 287 unsigned poweroffset = insertoffset / (dbH->dim * sizeof(double));
mas01cr@239 288 double *powerdata = powerTable + poweroffset;
mas01cr@239 289 insertPowerData(numVectors, thispowerfd, powerdata);
mas01cr@239 290 if (0 < thispowerfd) {
mas01cr@239 291 close(thispowerfd);
mas01cr@239 292 }
mas01cr@239 293 }
mas01cr@256 294 strncpy(fileTable + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE, thisKey, strlen(thisKey));
mas01cr@239 295
mas01cr@239 296 off_t insertoffset = dbH->length;// Store current state
mas01cr@239 297
mas01cr@239 298 // Increment file count
mas01cr@239 299 dbH->numFiles++;
mas01cr@239 300
mas01cr@239 301 // Update Header information
mas01cr@239 302 dbH->length+=(statbuf.st_size-sizeof(int));
mas01cr@239 303
mas01cr@239 304 // Update track to file index map
mas01cr@239 305 memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned));
mas01cr@239 306
mas01cr@239 307 insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int));
mas01cr@239 308
mas01cr@239 309 // Norm the vectors on input if the database is already L2 normed
mas01cr@239 310 if(dbH->flags & O2_FLAG_L2NORM)
mas01cr@239 311 unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append
mas01cr@239 312
mas01cr@239 313 totalVectors+=numVectors;
mas01cr@239 314
mas01cr@239 315 // Copy the header back to the database
mas01cr@239 316 memcpy (db, dbH, sizeof(dbTableHeaderT));
mas01cr@239 317 }
mas01cr@239 318 }
mas01cr@239 319 // CLEAN UP
mas01cr@239 320 munmap(indata,statbuf.st_size);
mas01cr@239 321 close(infid);
mas01cr@239 322 } while(!filesIn->eof());
mas01cr@239 323
mas01cr@239 324 VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double)));
mas01cr@262 325
mas01cr@262 326 delete [] thisPowerFileName;
mas01cr@262 327 if(key && (key != inFile)) {
mas01cr@262 328 delete [] thisKey;
mas01cr@262 329 }
mas01cr@262 330 delete [] thisFile;
mas01cr@262 331 delete [] thisTimesFileName;
mas01cr@239 332
mas01cr@262 333 delete filesIn;
mas01cr@262 334 delete keysIn;
mas01cr@262 335
mas01cr@239 336 // Report status
mas01cr@239 337 status(dbName);
mas01cr@239 338 }