Mercurial > hg > audiodb
diff index.cpp @ 292:d9a88cfd4ab6
Completed merge of lshlib back to current version of the trunk.
author | mas01mc |
---|---|
date | Tue, 29 Jul 2008 22:01:17 +0000 |
parents | |
children | f922c234462f |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/index.cpp Tue Jul 29 22:01:17 2008 +0000 @@ -0,0 +1,504 @@ +// LSH indexing +// +// Construct a persistent LSH table structure +// Store at the same location as dbName +// Naming convention: +// dbName.lsh.${radius}.${sequenceLength} +// +// +// Author: Michael Casey +// Date: 23 June 2008 + +#include "audioDB.h" +#include "ReporterBase.h" + + +/************************* LSH point index to audioDB conversion *****************/ +Uns32T audioDB::index_to_trackID(Uns32T lshID){ + return lshID>>LSH_N_POINT_BITS; +} + +Uns32T audioDB::index_to_trackPos(Uns32T lshID){ + return lshID&LSH_POINT_MASK; +} + +Uns32T audioDB::index_from_trackInfo(Uns32T trackID, Uns32T spos){ + return (trackID << LSH_N_POINT_BITS) | spos; +} + +/************************* LSH indexing and query initialization *****************/ + +char* audioDB::index_get_name(const char*dbName, double radius, Uns32T sequenceLength){ + char* indexName = new char[MAXSTR]; + // Attempt to make new file + if(strlen(dbName) > (MAXSTR - 32)) + error("dbName is too long for LSH index filename appendages"); + strncpy(indexName, dbName, MAXSTR); + sprintf(indexName+strlen(dbName), ".lsh.%019.9f.%d", radius, sequenceLength); + return indexName; +} + +// return true if index exists else return false +int audioDB::index_exists(const char* dbName, double radius, Uns32T sequenceLength){ + // Test to see if file exists + char* indexName = index_get_name(dbName, radius, sequenceLength); + lshfid = open (indexName, O_RDONLY); + delete[] indexName; + close(lshfid); + + if(lshfid<0) + return false; + else + return true; +} + +vector<vector<float> >* audioDB::index_initialize_shingles(Uns32T sz){ + if(vv) + delete vv; + vv = new vector<vector<float> >(sz); + for(Uns32T i=0 ; i < sz ; i++) + (*vv)[i]=vector<float>(dbH->dim*sequenceLength); // allocate shingle storage + return vv; +} + +/******************** LSH indexing audioDB database access forall s \in {S} ***********************/ + +// Prepare the AudioDB database for read access and allocate auxillary memory +void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) { + *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors + *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector + + double *snpp = *snp, *sppp = 0; + memcpy(*snp, l2normTable, *dvp * sizeof(double)); + + if (!(dbH->flags & O2_FLAG_POWER)) { + error("database not power-enabled", dbName); + } + *spp = new double[*dvp]; // song powertable pointer + sppp = *spp; + memcpy(*spp, powerTable, *dvp * sizeof(double)); + + for(Uns32T i = 0; i < dbH->numFiles; i++){ + if(trackTable[i] >= sequenceLength) { + sequence_sum(snpp, trackTable[i], sequenceLength); + sequence_sqrt(snpp, trackTable[i], sequenceLength); + + sequence_sum(sppp, trackTable[i], sequenceLength); + sequence_average(sppp, trackTable[i], sequenceLength); + } + snpp += trackTable[i]; + sppp += trackTable[i]; + } + + *vsnp = *snp; + *vspp = *spp; + + // Move the feature vector read pointer to start of fetures in database + lseek(dbfid, dbH->dataOffset, SEEK_SET); +} + + +/************************ LSH indexing ***********************************/ +void audioDB::index_index_db(const char* dbName){ + + char* newIndexName; + double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0; + Uns32T dbVectors = 0; + + printf("INDEX: initializing header\n"); + // Check if audioDB exists, initialize header and open database for read + forWrite = false; + initDBHeader(dbName); + + newIndexName = index_get_name(dbName, radius, sequenceLength); + + // Set unit norming flag override + audioDB::normalizedDistance = !audioDB::no_unit_norming; + + printf("INDEX: dim %d\n", dbH->dim); + printf("INDEX: R %f\n", radius); + printf("INDEX: seqlen %d\n", sequenceLength); + printf("INDEX: lsh_w %f\n", lsh_param_w); + printf("INDEX: lsh_k %d\n", lsh_param_k); + printf("INDEX: lsh_m %d\n", lsh_param_m); + printf("INDEX: lsh_N %d\n", lsh_param_N); + printf("INDEX: lsh_b %d\n", lsh_param_b); + printf("INDEX: normalized? %s\n", normalizedDistance?"true":"false"); + fflush(stdout); + + + index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors); + + if((lshfid = open(newIndexName,O_RDONLY))<0){ + printf("INDEX: constructing new LSH index\n"); + printf("INDEX: making index file %s\n", newIndexName); + fflush(stdout); + // Construct new LSH index + lsh = new LSH((float)lsh_param_w, lsh_param_k, + lsh_param_m, + (Uns32T)(sequenceLength*dbH->dim), + lsh_param_N, + lsh_param_ncols, + (float)radius); + assert(lsh); + + Uns32T endTrack = lsh_param_b; + if( endTrack > dbH->numFiles) + endTrack = dbH->numFiles; + // Insert up to lsh_param_b tracks + index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr); + lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); + + // Clean up + delete lsh; + close(lshfid); + } + + // Attempt to open LSH file + if((lshfid = open(newIndexName,O_RDONLY))>0){ + printf("INDEX: merging with existing LSH index\n"); + fflush(stdout); + + // Get the lsh header info and find how many tracks are inserted already + lsh = new LSH(newIndexName, false); // lshInCore=false to avoid loading hashTables here + assert(lsh); + Uns32T maxs = index_to_trackID(lsh->get_maxp())+1; + delete lsh; + + // This allows for updating index after more tracks are inserted into audioDB + for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){ + + Uns32T endTrack = startTrack + lsh_param_b; + if( endTrack > dbH->numFiles) + endTrack = dbH->numFiles; + printf("Indexing track range: %d - %d\n", startTrack, endTrack); + fflush(stdout); + lsh = new LSH(newIndexName, lsh_in_core); // Initialize core memory for LSH tables + assert(lsh); + + // Insert up to lsh_param_b database tracks + index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr); + + // Serialize to file + lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk + delete lsh; + } + + close(lshfid); + printf("INDEX: done constructing LSH index.\n"); + fflush(stdout); + + } + else{ + error("Something's wrong with LSH index file"); + exit(1); + } + + + delete[] newIndexName; + if(sNorm) + delete[] sNorm; + if(sPower) + delete[] sPower; + + +} + +void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track, + double** fvpp, double** sNormpp,double** snPtrp, + double** sPowerp, double** spPtrp){ + size_t nfv = 0; + double* fvp = 0; // Keep pointer for memory allocation and free() for track data + Uns32T trackID = 0; + + VERB_LOG(1, "indexing tracks..."); + + + for(trackID = start_track ; trackID < end_track ; trackID++ ){ + read_data(trackID, &fvp, &nfv); // over-writes fvp and nfv + *fvpp = fvp; // Protect memory allocation and free() for track data + if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp)) + break; + } + std::cout << "finished inserting." << endl; +} + +int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){ + // Loop over the current input track's vectors + Uns32T numVecs = (trackTable[trackID]>O2_MAXTRACKLEN?O2_MAXTRACKLEN:trackTable[trackID]) - sequenceLength + 1 ; + vv = index_initialize_shingles(numVecs); + + for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ ) + index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength); + + Uns32T numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp); + Uns32T collisionCount = index_insert_shingles(vv, trackID, *sppp); + float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0; + + /* index_norm_shingles() only goes as far as the end of the + sequence, which is right, but the space allocated is for the + whole track. */ + + /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN + * So let's be certain the pointers are in the correct place + */ + + *snpp += trackTable[trackID]; + *sppp += trackTable[trackID]; + *fvpp += trackTable[trackID] * dbH->dim; + + std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl; + std::cout.flush(); + return true; +} + +Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){ + Uns32T collisionCount = 0; + cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE; + for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID++) + if(!use_absolute_threshold || (use_absolute_threshold && (*spp++ >= absolute_threshold))) + collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID)); + return collisionCount; +} + +/********************* LSH shingle construction ***************************/ + +// Construct shingles out of a feature matrix +// inputs: +// idx is vector index in feature matrix +// fvp is base feature matrix pointer double* [numVecs x dbH->dim] +// +// pre-conditions: +// dbH->dim +// sequenceLength +// idx < numVectors - sequenceLength + 1 +// +// post-conditions: +// (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values + +void audioDB::index_make_shingle(vector<vector<float> >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){ + assert(idx<(*vv).size()); + vector<float>::iterator ve = (*vv)[idx].end(); + vi=(*vv)[idx].begin(); // shingle iterator + // First feature vector in shingle + if(idx==0){ + while(vi!=ve) + *vi++ = (float)(*fvp++); + } + // Not first feature vector in shingle + else{ + vector<float>::iterator ui=(*vv)[idx-1].begin() + dim; // previous shingle iterator + // Previous seqLen-1 dim-vectors + while(vi!=ve-dim) + *vi++=*ui++; + // Move data pointer to next feature vector + fvp += ( seqLen + idx - 1 ) * dim ; + // New d-vector + while(vi!=ve) + *vi++ = (float)(*fvp++); + } +} + +// norm shingles +// in-place norming, no deletions +// If using power, return number of shingles above power threshold +int audioDB::index_norm_shingles(vector<vector<float> >* vv, double* snp, double* spp){ + int z = 0; // number of above-threshold shingles + float l2norm; + double power; + float oneOverRadius = 1./(float)sqrt(radius); // Passed radius is really radius^2 + float oneOverSqrtl2NormDivRad = oneOverRadius; + if(!spp) + error("LSH indexing and query requires a power feature using -w or -W"); + Uns32T shingleSize = sequenceLength*dbH->dim; + for(Uns32T a=0; a<(*vv).size(); a++){ + l2norm = (float)(*snp++); + if(audioDB::normalizedDistance) + oneOverSqrtl2NormDivRad = (1./l2norm)*oneOverRadius; + + for(Uns32T b=0; b < shingleSize ; b++) + (*vv)[a][b]*=oneOverSqrtl2NormDivRad; + + power = *spp++; + if(use_absolute_threshold){ + if ( power >= absolute_threshold ) + z++; + } + else + z++; + } + return z; +} + + +/*********************** LSH retrieval ****************************/ + + +// return true if indexed query performed else return false +int audioDB::index_init_query(const char* dbName){ + + if(!(index_exists(dbName, radius, sequenceLength))) + return false; + + char* indexName = index_get_name(dbName, radius, sequenceLength); + + // Test to see if file exists + if((lshfid = open (indexName, O_RDONLY)) < 0){ + delete[] indexName; + return false; + } + + printf("INDEX: initializing header\n"); + + lsh = new LSH(indexName, false); // Get the header only here + assert(lsh); + sequenceLength = lsh->get_lshHeader()->dataDim / dbH->dim; // shingleDim / vectorDim + + + if( fabs(radius - lsh->get_radius())>fabs(O2_DISTANCE_TOLERANCE)) + printf("*** Warning: adb_radius (%f) != lsh_radius (%f) ***\n", radius, lsh->get_radius()); + + printf("INDEX: dim %d\n", dbH->dim); + printf("INDEX: R %f\n", lsh->get_radius()); + printf("INDEX: seqlen %d\n", sequenceLength); + printf("INDEX: w %f\n", lsh->get_lshHeader()->get_binWidth()); + printf("INDEX: k %d\n", lsh->get_lshHeader()->get_numFuns()); + printf("INDEX: L (m*(m-1))/2 %d\n", lsh->get_lshHeader()->get_numTables()); + printf("INDEX: N %d\n", lsh->get_lshHeader()->get_numRows()); + printf("INDEX: s %d\n", index_to_trackID(lsh->get_maxp())); + printf("INDEX: Opened LSH index file %s\n", indexName); + fflush(stdout); + + // Check to see if we are loading hash tables into core, and do so if true + if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core){ + printf("INDEX: loading hash tables into core %s\n", (lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2)?"FORMAT2":"FORMAT1"); + delete lsh; + lsh = new LSH(indexName, true); + } + + delete[] indexName; + return true; +} + +// *Static* approximate NN point reporter callback method for lshlib +void audioDB::index_add_point_approximate(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){ + assert(instancePtr); // We need an instance for this callback + audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance + Uns32T trackID = index_to_trackID(pointID); + Uns32T spos = index_to_trackPos(pointID); + // Skip identity in query_from_key + if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) ) + myself->reporter->add_point(trackID, qpos, spos, dist); +} + +// *Static* exact NN point reporter callback method for lshlib +// Maintain a queue of points to pass to query_points() for exact evaluation +void audioDB::index_add_point_exact(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){ + assert(instancePtr); // We need an instance for this callback + audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance + Uns32T trackID = index_to_trackID(pointID); + Uns32T spos = index_to_trackPos(pointID); + // Skip identity in query_from_key + if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) ) + myself->index_insert_exact_evaluation_queue(trackID, qpos, spos); +} + +void audioDB::initialize_exact_evalutation_queue(){ + if(exact_evaluation_queue) + delete exact_evaluation_queue; + exact_evaluation_queue = new priority_queue<PointPair, std::vector<PointPair>, std::less<PointPair> >; +} + +void audioDB::index_insert_exact_evaluation_queue(Uns32T trackID, Uns32T qpos, Uns32T spos){ + PointPair p(trackID, qpos, spos); + exact_evaluation_queue->push(p); +} + +// return 0: if index does not exist +// return nqv: if index exists +int audioDB::index_query_loop(const char* dbName, Uns32T queryIndex) { + + unsigned int numVectors; + double *query, *query_data; + double *qNorm, *qnPtr, *qPower = 0, *qpPtr = 0; + double meanQdur; + void (*add_point_func)(void*,Uns32T,Uns32T,float); + + // Set the point-reporter callback based on the value of lsh_exact + if(lsh_exact){ + initialize_exact_evalutation_queue(); + add_point_func = &index_add_point_exact; + } + else + add_point_func = &index_add_point_approximate; + + if(!index_init_query(dbName)) // sets-up LSH index structures for querying + return 0; + + char* database = index_get_name(dbName, radius, sequenceLength); + + if(query_from_key) + set_up_query_from_key(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors, queryIndex); + else + set_up_query(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors); // get query vectors + + VERB_LOG(1, "retrieving tracks..."); + + assert(pointNN>0 && pointNN<=O2_MAXNN); + assert(trackNN>0 && trackNN<=O2_MAXNN); + + gettimeofday(&tv1, NULL); + // query vector index + Uns32T Nq = (numVectors>O2_MAXTRACKLEN?O2_MAXTRACKLEN:numVectors) - sequenceLength + 1; + vv = index_initialize_shingles(Nq); // allocate memory to copy query vectors to shingles + cout << "Nq=" << Nq; cout.flush(); + // Construct shingles from query features + for( Uns32T pointID = 0 ; pointID < Nq ; pointID++ ) + index_make_shingle(vv, pointID, query, dbH->dim, sequenceLength); + + // Normalize query vectors + Uns32T numVecsAboveThreshold = index_norm_shingles( vv, qnPtr, qpPtr ); + cout << " Nq'=" << numVecsAboveThreshold << endl; cout.flush(); + + // Nq contains number of inspected points in query file, + // numVecsAboveThreshold is number of points with power >= absolute_threshold + double* qpp = qpPtr; // Keep original qpPtr for possible exact evaluation + if(usingQueryPoint && numVecsAboveThreshold){ + if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core) + lsh->retrieve_point((*vv)[0], queryPoint, add_point_func, (void*)this); + else + lsh->serial_retrieve_point(database, (*vv)[0], queryPoint, add_point_func, (void*)this); + } + else if(numVecsAboveThreshold) + for( Uns32T pointID = 0 ; pointID < Nq; pointID++ ) + if(!use_absolute_threshold || (use_absolute_threshold && (*qpp++ >= absolute_threshold))) + if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core) + lsh->retrieve_point((*vv)[pointID], pointID, add_point_func, (void*)this); + else + lsh->serial_retrieve_point(database, (*vv)[pointID], pointID, add_point_func, (void*)this); + + if(lsh_exact) + // Perform exact distance computation on point pairs in exact_evaluation_queue + query_loop_points(query, qnPtr, qpPtr, meanQdur, numVectors); + + gettimeofday(&tv2,NULL); + VERB_LOG(1,"elapsed time: %ld msec\n", + (tv2.tv_sec*1000 + tv2.tv_usec/1000) - + (tv1.tv_sec*1000 + tv1.tv_usec/1000)) + + // Close the index file + close(lshfid); + + // Clean up + if(query_data) + delete[] query_data; + if(qNorm) + delete[] qNorm; + if(qPower) + delete[] qPower; + if(database) + delete[] database; + + return Nq; +} +