comparison insert.cpp @ 239:2cc06e5b05a5

Merge refactoring branch. Bug fixes: * 64-bit powertable bug; * -inf - -inf bug; * use new times information; * plus short track, O2_MAXFILES and structure padding ABI fixes (already backported) Major code changes: * split source into functional units, known as 'files'; * Reporter class for accumulating and reporting on query results; * much OAOOization, mostly from above: net 800 LOC (25%) shorter.
author mas01cr
date Thu, 13 Dec 2007 14:23:32 +0000
parents
children a6c9a1c68646 abfb26e08d9c
comparison
equal deleted inserted replaced
224:3a81da6fb1d7 239:2cc06e5b05a5
1 #include "audioDB.h"
2
3 bool audioDB::enough_data_space_free(off_t size) {
4 return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size);
5 }
6
7 void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) {
8 lseek(dbfid, dbH->dataOffset + offset, SEEK_SET);
9 write(dbfid, buffer, size);
10 }
11
12 void audioDB::insert(const char* dbName, const char* inFile) {
13 forWrite = true;
14 initTables(dbName, inFile);
15
16 if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
17 error("Must use timestamps with timestamped database","use --times");
18
19 if(!usingPower && (dbH->flags & O2_FLAG_POWER))
20 error("Must use power with power-enabled database", dbName);
21
22 if(!enough_data_space_free(statbuf.st_size - sizeof(int))) {
23 error("Insert failed: no more room in database", inFile);
24 }
25
26 if(!key)
27 key=inFile;
28 // Linear scan of filenames check for pre-existing feature
29 unsigned alreadyInserted=0;
30 for(unsigned k=0; k<dbH->numFiles; k++)
31 if(strncmp(fileTable + k*O2_FILETABLESIZE, key, strlen(key)+1)==0){
32 alreadyInserted=1;
33 break;
34 }
35
36 if(alreadyInserted) {
37 VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile);
38 return;
39 }
40
41 // Make a track index table of features to file indexes
42 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
43 if(!numVectors) {
44 VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", key);
45
46 // CLEAN UP
47 munmap(indata,statbuf.st_size);
48 munmap(db,dbH->dbSize);
49 close(infid);
50 return;
51 }
52
53 strncpy(fileTable + dbH->numFiles*O2_FILETABLESIZE, key, strlen(key));
54
55 off_t insertoffset = dbH->length;// Store current state
56
57 // Check times status and insert times from file
58 unsigned indexoffset = insertoffset/(dbH->dim*sizeof(double));
59 double *timesdata = timesTable + 2*indexoffset;
60
61 if(2*(indexoffset + numVectors) > timesTableLength) {
62 error("out of space for times", key);
63 }
64
65 if (usingTimes) {
66 insertTimeStamps(numVectors, timesFile, timesdata);
67 }
68
69 double *powerdata = powerTable + indexoffset;
70 insertPowerData(numVectors, powerfd, powerdata);
71
72 // Increment file count
73 dbH->numFiles++;
74
75 // Update Header information
76 dbH->length+=(statbuf.st_size-sizeof(int));
77
78 // Update track to file index map
79 memcpy(trackTable + dbH->numFiles - 1, &numVectors, sizeof(unsigned));
80
81 insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int));
82
83 // Norm the vectors on input if the database is already L2 normed
84 if(dbH->flags & O2_FLAG_L2NORM)
85 unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append
86
87 // Report status
88 status(dbName);
89 VERB_LOG(0, "%s %s %u vectors %jd bytes.\n", COM_INSERT, dbName, numVectors, (intmax_t) (statbuf.st_size - sizeof(int)));
90
91 // Copy the header back to the database
92 memcpy (db, dbH, sizeof(dbTableHeaderT));
93
94 // CLEAN UP
95 munmap(indata,statbuf.st_size);
96 close(infid);
97 }
98
99 void audioDB::insertTimeStamps(unsigned numVectors, std::ifstream *timesFile, double *timesdata) {
100 assert(usingTimes);
101
102 unsigned numtimes = 0;
103
104 if(!(dbH->flags & O2_FLAG_TIMES) && !dbH->numFiles) {
105 dbH->flags=dbH->flags|O2_FLAG_TIMES;
106 } else if(!(dbH->flags & O2_FLAG_TIMES)) {
107 error("Timestamp file used with non-timestamped database", timesFileName);
108 }
109
110 if(!timesFile->is_open()) {
111 error("problem opening times file on timestamped database", timesFileName);
112 }
113
114 double timepoint, next;
115 *timesFile >> timepoint;
116 if (timesFile->eof()) {
117 error("no entries in times file", timesFileName);
118 }
119 numtimes++;
120 do {
121 *timesFile >> next;
122 if (timesFile->eof()) {
123 break;
124 }
125 numtimes++;
126 timesdata[0] = timepoint;
127 timepoint = (timesdata[1] = next);
128 timesdata += 2;
129 } while (numtimes < numVectors + 1);
130
131 if (numtimes < numVectors + 1) {
132 error("too few timepoints in times file", timesFileName);
133 }
134
135 *timesFile >> next;
136 if (!timesFile->eof()) {
137 error("too many timepoints in times file", timesFileName);
138 }
139 }
140
141 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
142 if (usingPower) {
143 if (!(dbH->flags & O2_FLAG_POWER)) {
144 error("Cannot insert power data on non-power DB", dbName);
145 }
146
147 int one;
148 unsigned int count;
149
150 count = read(powerfd, &one, sizeof(unsigned int));
151 if (count != sizeof(unsigned int)) {
152 error("powerfd read failed", "int", "read");
153 }
154 if (one != 1) {
155 error("dimensionality of power file not 1", powerFileName);
156 }
157
158 // FIXME: should check that the powerfile is the right size for
159 // this. -- CSR, 2007-10-30
160 count = read(powerfd, powerdata, numVectors * sizeof(double));
161 if (count != numVectors * sizeof(double)) {
162 error("powerfd read failed", "double", "read");
163 }
164 }
165 }
166
167 void audioDB::batchinsert(const char* dbName, const char* inFile) {
168
169 forWrite = true;
170 initDBHeader(dbName);
171
172 if(!key)
173 key=inFile;
174 std::ifstream *filesIn = 0;
175 std::ifstream *keysIn = 0;
176 std::ifstream* thisTimesFile = 0;
177 int thispowerfd = 0;
178
179 if(!(filesIn = new std::ifstream(inFile)))
180 error("Could not open batch in file", inFile);
181 if(key && key!=inFile)
182 if(!(keysIn = new std::ifstream(key)))
183 error("Could not open batch key file",key);
184
185 if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
186 error("Must use timestamps with timestamped database","use --times");
187
188 if(!usingPower && (dbH->flags & O2_FLAG_POWER))
189 error("Must use power with power-enabled database", dbName);
190
191 unsigned totalVectors=0;
192 char *thisKey = new char[MAXSTR];
193 char *thisFile = new char[MAXSTR];
194 char *thisTimesFileName = new char[MAXSTR];
195 char *thisPowerFileName = new char[MAXSTR];
196
197 do{
198 filesIn->getline(thisFile,MAXSTR);
199 if(key && key!=inFile)
200 keysIn->getline(thisKey,MAXSTR);
201 else
202 thisKey = thisFile;
203 if(usingTimes)
204 timesFile->getline(thisTimesFileName,MAXSTR);
205 if(usingPower)
206 powerFile->getline(thisPowerFileName, MAXSTR);
207
208 if(filesIn->eof())
209 break;
210
211 initInputFile(thisFile);
212
213 if(!enough_data_space_free(statbuf.st_size - sizeof(int))) {
214 error("batchinsert failed: no more room in database", thisFile);
215 }
216
217 // Linear scan of filenames check for pre-existing feature
218 unsigned alreadyInserted=0;
219
220 for(unsigned k=0; k<dbH->numFiles; k++)
221 if(strncmp(fileTable + k*O2_FILETABLESIZE, thisKey, strlen(thisKey)+1)==0){
222 alreadyInserted=1;
223 break;
224 }
225
226 if(alreadyInserted) {
227 VERB_LOG(0, "key already exists in database: %s\n", thisKey);
228 } else {
229 // Make a track index table of features to file indexes
230 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
231 if(!numVectors) {
232 VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey);
233 }
234 else{
235 if(usingTimes){
236 if(timesFile->eof()) {
237 error("not enough timestamp files in timesList", timesFileName);
238 }
239 thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in);
240 if(!thisTimesFile->is_open()) {
241 error("Cannot open timestamp file", thisTimesFileName);
242 }
243 off_t insertoffset = dbH->length;
244 unsigned indexoffset = insertoffset / (dbH->dim*sizeof(double));
245 double *timesdata = timesTable + 2*indexoffset;
246 if(2*(indexoffset + numVectors) > timesTableLength) {
247 error("out of space for times", key);
248 }
249 insertTimeStamps(numVectors, thisTimesFile, timesdata);
250 if(thisTimesFile)
251 delete thisTimesFile;
252 }
253
254 if (usingPower) {
255 if(powerFile->eof()) {
256 error("not enough power files in powerList", powerFileName);
257 }
258 thispowerfd = open(thisPowerFileName, O_RDONLY);
259 if (thispowerfd < 0) {
260 error("failed to open power file", thisPowerFileName);
261 }
262 off_t insertoffset = dbH->length;
263 unsigned poweroffset = insertoffset / (dbH->dim * sizeof(double));
264 double *powerdata = powerTable + poweroffset;
265 insertPowerData(numVectors, thispowerfd, powerdata);
266 if (0 < thispowerfd) {
267 close(thispowerfd);
268 }
269 }
270 strncpy(fileTable + dbH->numFiles*O2_FILETABLESIZE, thisKey, strlen(thisKey));
271
272 off_t insertoffset = dbH->length;// Store current state
273
274 // Increment file count
275 dbH->numFiles++;
276
277 // Update Header information
278 dbH->length+=(statbuf.st_size-sizeof(int));
279
280 // Update track to file index map
281 memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned));
282
283 insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int));
284
285 // Norm the vectors on input if the database is already L2 normed
286 if(dbH->flags & O2_FLAG_L2NORM)
287 unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append
288
289 totalVectors+=numVectors;
290
291 // Copy the header back to the database
292 memcpy (db, dbH, sizeof(dbTableHeaderT));
293 }
294 }
295 // CLEAN UP
296 munmap(indata,statbuf.st_size);
297 close(infid);
298 } while(!filesIn->eof());
299
300 VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double)));
301
302 // Report status
303 status(dbName);
304 }