annotate insert.cpp @ 239:2cc06e5b05a5

Merge refactoring branch. Bug fixes: * 64-bit powertable bug; * -inf - -inf bug; * use new times information; * plus short track, O2_MAXFILES and structure padding ABI fixes (already backported) Major code changes: * split source into functional units, known as 'files'; * Reporter class for accumulating and reporting on query results; * much OAOOization, mostly from above: net 800 LOC (25%) shorter.
author mas01cr
date Thu, 13 Dec 2007 14:23:32 +0000
parents
children a6c9a1c68646 abfb26e08d9c
rev   line source
mas01cr@239 1 #include "audioDB.h"
mas01cr@239 2
mas01cr@239 3 bool audioDB::enough_data_space_free(off_t size) {
mas01cr@239 4 return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size);
mas01cr@239 5 }
mas01cr@239 6
mas01cr@239 7 void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) {
mas01cr@239 8 lseek(dbfid, dbH->dataOffset + offset, SEEK_SET);
mas01cr@239 9 write(dbfid, buffer, size);
mas01cr@239 10 }
mas01cr@239 11
mas01cr@239 12 void audioDB::insert(const char* dbName, const char* inFile) {
mas01cr@239 13 forWrite = true;
mas01cr@239 14 initTables(dbName, inFile);
mas01cr@239 15
mas01cr@239 16 if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
mas01cr@239 17 error("Must use timestamps with timestamped database","use --times");
mas01cr@239 18
mas01cr@239 19 if(!usingPower && (dbH->flags & O2_FLAG_POWER))
mas01cr@239 20 error("Must use power with power-enabled database", dbName);
mas01cr@239 21
mas01cr@239 22 if(!enough_data_space_free(statbuf.st_size - sizeof(int))) {
mas01cr@239 23 error("Insert failed: no more room in database", inFile);
mas01cr@239 24 }
mas01cr@239 25
mas01cr@239 26 if(!key)
mas01cr@239 27 key=inFile;
mas01cr@239 28 // Linear scan of filenames check for pre-existing feature
mas01cr@239 29 unsigned alreadyInserted=0;
mas01cr@239 30 for(unsigned k=0; k<dbH->numFiles; k++)
mas01cr@239 31 if(strncmp(fileTable + k*O2_FILETABLESIZE, key, strlen(key)+1)==0){
mas01cr@239 32 alreadyInserted=1;
mas01cr@239 33 break;
mas01cr@239 34 }
mas01cr@239 35
mas01cr@239 36 if(alreadyInserted) {
mas01cr@239 37 VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile);
mas01cr@239 38 return;
mas01cr@239 39 }
mas01cr@239 40
mas01cr@239 41 // Make a track index table of features to file indexes
mas01cr@239 42 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
mas01cr@239 43 if(!numVectors) {
mas01cr@239 44 VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", key);
mas01cr@239 45
mas01cr@239 46 // CLEAN UP
mas01cr@239 47 munmap(indata,statbuf.st_size);
mas01cr@239 48 munmap(db,dbH->dbSize);
mas01cr@239 49 close(infid);
mas01cr@239 50 return;
mas01cr@239 51 }
mas01cr@239 52
mas01cr@239 53 strncpy(fileTable + dbH->numFiles*O2_FILETABLESIZE, key, strlen(key));
mas01cr@239 54
mas01cr@239 55 off_t insertoffset = dbH->length;// Store current state
mas01cr@239 56
mas01cr@239 57 // Check times status and insert times from file
mas01cr@239 58 unsigned indexoffset = insertoffset/(dbH->dim*sizeof(double));
mas01cr@239 59 double *timesdata = timesTable + 2*indexoffset;
mas01cr@239 60
mas01cr@239 61 if(2*(indexoffset + numVectors) > timesTableLength) {
mas01cr@239 62 error("out of space for times", key);
mas01cr@239 63 }
mas01cr@239 64
mas01cr@239 65 if (usingTimes) {
mas01cr@239 66 insertTimeStamps(numVectors, timesFile, timesdata);
mas01cr@239 67 }
mas01cr@239 68
mas01cr@239 69 double *powerdata = powerTable + indexoffset;
mas01cr@239 70 insertPowerData(numVectors, powerfd, powerdata);
mas01cr@239 71
mas01cr@239 72 // Increment file count
mas01cr@239 73 dbH->numFiles++;
mas01cr@239 74
mas01cr@239 75 // Update Header information
mas01cr@239 76 dbH->length+=(statbuf.st_size-sizeof(int));
mas01cr@239 77
mas01cr@239 78 // Update track to file index map
mas01cr@239 79 memcpy(trackTable + dbH->numFiles - 1, &numVectors, sizeof(unsigned));
mas01cr@239 80
mas01cr@239 81 insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int));
mas01cr@239 82
mas01cr@239 83 // Norm the vectors on input if the database is already L2 normed
mas01cr@239 84 if(dbH->flags & O2_FLAG_L2NORM)
mas01cr@239 85 unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append
mas01cr@239 86
mas01cr@239 87 // Report status
mas01cr@239 88 status(dbName);
mas01cr@239 89 VERB_LOG(0, "%s %s %u vectors %jd bytes.\n", COM_INSERT, dbName, numVectors, (intmax_t) (statbuf.st_size - sizeof(int)));
mas01cr@239 90
mas01cr@239 91 // Copy the header back to the database
mas01cr@239 92 memcpy (db, dbH, sizeof(dbTableHeaderT));
mas01cr@239 93
mas01cr@239 94 // CLEAN UP
mas01cr@239 95 munmap(indata,statbuf.st_size);
mas01cr@239 96 close(infid);
mas01cr@239 97 }
mas01cr@239 98
mas01cr@239 99 void audioDB::insertTimeStamps(unsigned numVectors, std::ifstream *timesFile, double *timesdata) {
mas01cr@239 100 assert(usingTimes);
mas01cr@239 101
mas01cr@239 102 unsigned numtimes = 0;
mas01cr@239 103
mas01cr@239 104 if(!(dbH->flags & O2_FLAG_TIMES) && !dbH->numFiles) {
mas01cr@239 105 dbH->flags=dbH->flags|O2_FLAG_TIMES;
mas01cr@239 106 } else if(!(dbH->flags & O2_FLAG_TIMES)) {
mas01cr@239 107 error("Timestamp file used with non-timestamped database", timesFileName);
mas01cr@239 108 }
mas01cr@239 109
mas01cr@239 110 if(!timesFile->is_open()) {
mas01cr@239 111 error("problem opening times file on timestamped database", timesFileName);
mas01cr@239 112 }
mas01cr@239 113
mas01cr@239 114 double timepoint, next;
mas01cr@239 115 *timesFile >> timepoint;
mas01cr@239 116 if (timesFile->eof()) {
mas01cr@239 117 error("no entries in times file", timesFileName);
mas01cr@239 118 }
mas01cr@239 119 numtimes++;
mas01cr@239 120 do {
mas01cr@239 121 *timesFile >> next;
mas01cr@239 122 if (timesFile->eof()) {
mas01cr@239 123 break;
mas01cr@239 124 }
mas01cr@239 125 numtimes++;
mas01cr@239 126 timesdata[0] = timepoint;
mas01cr@239 127 timepoint = (timesdata[1] = next);
mas01cr@239 128 timesdata += 2;
mas01cr@239 129 } while (numtimes < numVectors + 1);
mas01cr@239 130
mas01cr@239 131 if (numtimes < numVectors + 1) {
mas01cr@239 132 error("too few timepoints in times file", timesFileName);
mas01cr@239 133 }
mas01cr@239 134
mas01cr@239 135 *timesFile >> next;
mas01cr@239 136 if (!timesFile->eof()) {
mas01cr@239 137 error("too many timepoints in times file", timesFileName);
mas01cr@239 138 }
mas01cr@239 139 }
mas01cr@239 140
mas01cr@239 141 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
mas01cr@239 142 if (usingPower) {
mas01cr@239 143 if (!(dbH->flags & O2_FLAG_POWER)) {
mas01cr@239 144 error("Cannot insert power data on non-power DB", dbName);
mas01cr@239 145 }
mas01cr@239 146
mas01cr@239 147 int one;
mas01cr@239 148 unsigned int count;
mas01cr@239 149
mas01cr@239 150 count = read(powerfd, &one, sizeof(unsigned int));
mas01cr@239 151 if (count != sizeof(unsigned int)) {
mas01cr@239 152 error("powerfd read failed", "int", "read");
mas01cr@239 153 }
mas01cr@239 154 if (one != 1) {
mas01cr@239 155 error("dimensionality of power file not 1", powerFileName);
mas01cr@239 156 }
mas01cr@239 157
mas01cr@239 158 // FIXME: should check that the powerfile is the right size for
mas01cr@239 159 // this. -- CSR, 2007-10-30
mas01cr@239 160 count = read(powerfd, powerdata, numVectors * sizeof(double));
mas01cr@239 161 if (count != numVectors * sizeof(double)) {
mas01cr@239 162 error("powerfd read failed", "double", "read");
mas01cr@239 163 }
mas01cr@239 164 }
mas01cr@239 165 }
mas01cr@239 166
mas01cr@239 167 void audioDB::batchinsert(const char* dbName, const char* inFile) {
mas01cr@239 168
mas01cr@239 169 forWrite = true;
mas01cr@239 170 initDBHeader(dbName);
mas01cr@239 171
mas01cr@239 172 if(!key)
mas01cr@239 173 key=inFile;
mas01cr@239 174 std::ifstream *filesIn = 0;
mas01cr@239 175 std::ifstream *keysIn = 0;
mas01cr@239 176 std::ifstream* thisTimesFile = 0;
mas01cr@239 177 int thispowerfd = 0;
mas01cr@239 178
mas01cr@239 179 if(!(filesIn = new std::ifstream(inFile)))
mas01cr@239 180 error("Could not open batch in file", inFile);
mas01cr@239 181 if(key && key!=inFile)
mas01cr@239 182 if(!(keysIn = new std::ifstream(key)))
mas01cr@239 183 error("Could not open batch key file",key);
mas01cr@239 184
mas01cr@239 185 if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
mas01cr@239 186 error("Must use timestamps with timestamped database","use --times");
mas01cr@239 187
mas01cr@239 188 if(!usingPower && (dbH->flags & O2_FLAG_POWER))
mas01cr@239 189 error("Must use power with power-enabled database", dbName);
mas01cr@239 190
mas01cr@239 191 unsigned totalVectors=0;
mas01cr@239 192 char *thisKey = new char[MAXSTR];
mas01cr@239 193 char *thisFile = new char[MAXSTR];
mas01cr@239 194 char *thisTimesFileName = new char[MAXSTR];
mas01cr@239 195 char *thisPowerFileName = new char[MAXSTR];
mas01cr@239 196
mas01cr@239 197 do{
mas01cr@239 198 filesIn->getline(thisFile,MAXSTR);
mas01cr@239 199 if(key && key!=inFile)
mas01cr@239 200 keysIn->getline(thisKey,MAXSTR);
mas01cr@239 201 else
mas01cr@239 202 thisKey = thisFile;
mas01cr@239 203 if(usingTimes)
mas01cr@239 204 timesFile->getline(thisTimesFileName,MAXSTR);
mas01cr@239 205 if(usingPower)
mas01cr@239 206 powerFile->getline(thisPowerFileName, MAXSTR);
mas01cr@239 207
mas01cr@239 208 if(filesIn->eof())
mas01cr@239 209 break;
mas01cr@239 210
mas01cr@239 211 initInputFile(thisFile);
mas01cr@239 212
mas01cr@239 213 if(!enough_data_space_free(statbuf.st_size - sizeof(int))) {
mas01cr@239 214 error("batchinsert failed: no more room in database", thisFile);
mas01cr@239 215 }
mas01cr@239 216
mas01cr@239 217 // Linear scan of filenames check for pre-existing feature
mas01cr@239 218 unsigned alreadyInserted=0;
mas01cr@239 219
mas01cr@239 220 for(unsigned k=0; k<dbH->numFiles; k++)
mas01cr@239 221 if(strncmp(fileTable + k*O2_FILETABLESIZE, thisKey, strlen(thisKey)+1)==0){
mas01cr@239 222 alreadyInserted=1;
mas01cr@239 223 break;
mas01cr@239 224 }
mas01cr@239 225
mas01cr@239 226 if(alreadyInserted) {
mas01cr@239 227 VERB_LOG(0, "key already exists in database: %s\n", thisKey);
mas01cr@239 228 } else {
mas01cr@239 229 // Make a track index table of features to file indexes
mas01cr@239 230 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
mas01cr@239 231 if(!numVectors) {
mas01cr@239 232 VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey);
mas01cr@239 233 }
mas01cr@239 234 else{
mas01cr@239 235 if(usingTimes){
mas01cr@239 236 if(timesFile->eof()) {
mas01cr@239 237 error("not enough timestamp files in timesList", timesFileName);
mas01cr@239 238 }
mas01cr@239 239 thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in);
mas01cr@239 240 if(!thisTimesFile->is_open()) {
mas01cr@239 241 error("Cannot open timestamp file", thisTimesFileName);
mas01cr@239 242 }
mas01cr@239 243 off_t insertoffset = dbH->length;
mas01cr@239 244 unsigned indexoffset = insertoffset / (dbH->dim*sizeof(double));
mas01cr@239 245 double *timesdata = timesTable + 2*indexoffset;
mas01cr@239 246 if(2*(indexoffset + numVectors) > timesTableLength) {
mas01cr@239 247 error("out of space for times", key);
mas01cr@239 248 }
mas01cr@239 249 insertTimeStamps(numVectors, thisTimesFile, timesdata);
mas01cr@239 250 if(thisTimesFile)
mas01cr@239 251 delete thisTimesFile;
mas01cr@239 252 }
mas01cr@239 253
mas01cr@239 254 if (usingPower) {
mas01cr@239 255 if(powerFile->eof()) {
mas01cr@239 256 error("not enough power files in powerList", powerFileName);
mas01cr@239 257 }
mas01cr@239 258 thispowerfd = open(thisPowerFileName, O_RDONLY);
mas01cr@239 259 if (thispowerfd < 0) {
mas01cr@239 260 error("failed to open power file", thisPowerFileName);
mas01cr@239 261 }
mas01cr@239 262 off_t insertoffset = dbH->length;
mas01cr@239 263 unsigned poweroffset = insertoffset / (dbH->dim * sizeof(double));
mas01cr@239 264 double *powerdata = powerTable + poweroffset;
mas01cr@239 265 insertPowerData(numVectors, thispowerfd, powerdata);
mas01cr@239 266 if (0 < thispowerfd) {
mas01cr@239 267 close(thispowerfd);
mas01cr@239 268 }
mas01cr@239 269 }
mas01cr@239 270 strncpy(fileTable + dbH->numFiles*O2_FILETABLESIZE, thisKey, strlen(thisKey));
mas01cr@239 271
mas01cr@239 272 off_t insertoffset = dbH->length;// Store current state
mas01cr@239 273
mas01cr@239 274 // Increment file count
mas01cr@239 275 dbH->numFiles++;
mas01cr@239 276
mas01cr@239 277 // Update Header information
mas01cr@239 278 dbH->length+=(statbuf.st_size-sizeof(int));
mas01cr@239 279
mas01cr@239 280 // Update track to file index map
mas01cr@239 281 memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned));
mas01cr@239 282
mas01cr@239 283 insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int));
mas01cr@239 284
mas01cr@239 285 // Norm the vectors on input if the database is already L2 normed
mas01cr@239 286 if(dbH->flags & O2_FLAG_L2NORM)
mas01cr@239 287 unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append
mas01cr@239 288
mas01cr@239 289 totalVectors+=numVectors;
mas01cr@239 290
mas01cr@239 291 // Copy the header back to the database
mas01cr@239 292 memcpy (db, dbH, sizeof(dbTableHeaderT));
mas01cr@239 293 }
mas01cr@239 294 }
mas01cr@239 295 // CLEAN UP
mas01cr@239 296 munmap(indata,statbuf.st_size);
mas01cr@239 297 close(infid);
mas01cr@239 298 } while(!filesIn->eof());
mas01cr@239 299
mas01cr@239 300 VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double)));
mas01cr@239 301
mas01cr@239 302 // Report status
mas01cr@239 303 status(dbName);
mas01cr@239 304 }