Mercurial > hg > dml-open-backendtools
view pyspark/ilm/assetDB.py @ 0:e34cf1b6fe09 tip
commit
author | Daniel Wolff |
---|---|
date | Sat, 20 Feb 2016 18:14:24 +0100 |
parents | |
children |
line wrap: on
line source
# Part of DML (Digital Music Laboratory)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

#!/usr/bin/env python
# encoding: utf-8
"""
assetDB.py

Created by George Fazekas on 2012-01-16.
Modifications by Mathieu Barthet in 2013-12, Steven Hargreaves 22/12/2014.
Copyright (c) 2013 . All rights reserved.
"""

import sys
import os
import logging

import sqlalchemy as sal
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Sequence, Enum
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects import mysql
from hashlib import md5


class assetDB(object):
    '''Access audio assets listed in a MySQL database and resolve them to
    validated file paths on disk.

    The database connection parameters and the table name are read from the
    "Commercial Asset Database" section of the config object passed to the
    constructor (accessed through config.get(section, option)).'''

    asset_types = ['wav', 'mpeg/320kbps', 'mpeg/64kbps']
    extensions = ['wav', 'mp3', 'mp3']
    ext = dict(zip(asset_types, extensions))

    # Config section holding the database settings.
    _DB_SECTION = 'Commercial Asset Database'

    # Accepted file sizes in bytes as (minimum, maximum), both inclusive.
    # rationale: with very small files some feature extractor plugins fail or output junk
    _WAV_SIZE_RANGE = (209715, 209715200)
    # same assuming about 1:10 compression
    _MPEG_SIZE_RANGE = (65536, 41943040)

    def __init__(self, prefix, pref=None, config=None):
        '''Set up logging, asset type preferences and error reporting.

        prefix -- stored on the instance as self.prefix.
        pref   -- ordered list of asset types to fall back to; when empty or
                  None the class-level asset_types list is used.
        config -- configuration object; if it has a db_errata_file attribute,
                  errata records are appended to that file.'''
        self.log = logging.getLogger('spark_feat_extract')
        self.log.info("ORM Version: %s", sal.__version__)
        self.config = config
        self.session = None
        self.Assets = None
        self.prefix = prefix
        if pref:
            self.asset_prefs = pref
        else:
            self.asset_prefs = assetDB.asset_types

        # reporting errors:
        self.found_different_asset_type = 0
        self.errata_file = None
        if config and hasattr(config, "db_errata_file"):
            self.errata_file = config.db_errata_file

    def connect(self, echo=False):
        '''Connect to the MySQL database and create a session.

        echo -- passed through to sqlalchemy.create_engine.
        Returns self so that calls can be chained.'''
        section = self._DB_SECTION
        user = self.config.get(section, 'user')
        passwd = self.config.get(section, 'passwd')
        host = self.config.get(section, 'host')
        name = self.config.get(section, 'name')

        template = "mysql://%s:%s@%s/%s"
        URL = template % (user, passwd, host, name)
        # Build the logged URL from the template instead of str.replace() on
        # the real URL: replace() garbles the output for an empty password and
        # also masks the user/host/name if they happen to contain the password.
        masked_url = template % (user, '*****', host, name)
        self.log.info("Connecting to database server at: %s", masked_url)

        # NOTE(review): the credentials are interpolated verbatim; a password
        # containing URL-reserved characters would presumably need escaping
        # before it is put into the engine URL -- confirm against the config.
        engine = sal.create_engine(URL, echo=echo)
        Session = sessionmaker(bind=engine)
        self.session = Session()
        self.log.debug("MySQL session created successfully.")
        return self

    def close(self):
        '''Close the database session, if one is open. Returns self.'''
        if self.session:
            self.session.close()
            self.log.info("Database closed.")
        return self

    def create_mapper(self):
        '''Create an Object-Relational Mapper class for the asset table and
        store it in self.Assets. Returns self.

        The table name is read from the config; the column mapping has to be
        filled in below for the concrete database schema.'''
        Base = declarative_base()
        table_name = self.config.get(self._DB_SECTION, 'tablename')

        class Assets(Base):
            #change
            #__tablename__ = 'assets'
            __tablename__ = table_name
            # map all table columns to variables here, e.g.
            # album_id = Column(Integer, primary_key=True)
            # song_title = Column(String)
            # genre_id = Column(Integer)

        self.Assets = Assets
        return self

    def get_assets(self, start=0, limit=10, asset_type='audio/x-wav'):
        '''Returns some assets from the database.

        Generator yielding (path, asset) tuples. If the path given by the
        specified asset type does not exist, try to find the asset given the
        preference list provided in self.asset_prefs. If no valid path can be
        found for an asset, log the error and yield None for the path.
        Each asset yields exactly once.

        Note that the query result is sliced as [start:limit], i.e. limit is
        an end index rather than a row count.'''
        # limit = start + limit # this changes the semantics of the SQL limit

        # create the ORM mapper object if doesn't exist
        if self.Assets is None:
            self.create_mapper()

        for asset in self.session.query(self.Assets)[start:limit]:
            yield self._resolve_asset_path(asset, asset_type), asset

    def get_assets_by_genre(self, genre_id, start=0, limit=10, asset_type='audio/x-wav'):
        '''Returns some assets of the given genre_id from the database.

        Generator yielding (path, asset) tuples. If the path given by the
        specified asset type does not exist, try to find the asset given the
        preference list provided in self.asset_prefs. If no valid path can be
        found for an asset, log the error and yield None for the path.
        Each asset yields exactly once.

        Note that the query result is sliced as [start:limit], i.e. limit is
        an end index rather than a row count.'''
        # limit = start + limit # this changes the semantics of the SQL limit

        # create the ORM mapper object if doesn't exist
        if self.Assets is None:
            self.create_mapper()

        # Slice the query itself (as get_assets does) so that only the
        # requested window is fetched, instead of loading every matching row
        # with .all() and slicing the resulting list.
        query = self.session.query(self.Assets).filter(self.Assets.genre_id == genre_id)
        for asset in query[start:limit]:
            yield self._resolve_asset_path(asset, asset_type), asset

    def _resolve_asset_path(self, asset, asset_type):
        '''Return a validated path for the asset, or None if none was found.

        The path for the requested asset_type is tried first; if it is missing
        or has an implausible size, the types in self.asset_prefs are tried.'''
        path = self.generate_path(asset, asset_type)
        if self.validate_path(path) and self.validate_size(path, asset_type):
            return path

        if not self.asset_prefs:
            self.log.error("Requested file for asset not found: %s. (Album ID: %i)", asset.song_title, asset.album_id)
            return None

        #change
        self.log.warning("Requested file for asset bad or not found: %s. (Album ID: %i)", asset.song_title, asset.album_id)
        self.log.warning("Trying other asset types.")
        return self.find_preferred_asset_path(asset)

    def find_preferred_asset_path(self, asset):
        '''Iteratively find a path name for each asset type in asset_prefs
        and return the first one available.

        Return None if not found and log this event for error management.'''
        # Empty sentinel; stays empty when asset_prefs is empty or when the
        # last generated path is empty.
        path = ''
        for asset_type in self.asset_prefs:
            path = self.generate_path(asset, asset_type)
            if not self.validate_path(path):
                self.append_db_errata(path, "File not found.")
                continue

            self.log.info("Asset found but type is different from requested: %s. (Album ID: %i) ", asset.song_title, asset.album_id)
            self.append_db_errata(path, "Found different asset type for problem case. (%s)" % asset_type)
            self.found_different_asset_type += 1
            if self.validate_size(path, asset_type):
                return path
            self.log.error("Requested file for asset is worng size, probably corrupt: %s. (Album ID: %i)", asset.song_title, asset.album_id)

        if len(path) == 0:
            self.log.warning("Asset not found for: %s. (Album ID: %i)", asset.song_title, asset.album_id)
        return None

    def generate_path(self, asset, asset_type):
        '''Generate the path name given a asset database object and a
        requested asset type.

        Placeholder: currently always returns an empty string.'''
        path = ''
        # need to generate audio file path here
        return path

    def validate_path(self, path):
        '''Validate the generated path name: True if it is an existing file.'''
        return os.path.isfile(path)

    def validate_size(self, path, asset_type):
        '''Check if the file size makes sense.

        Returns False (and records an errata entry) if the size cannot be
        determined, is zero, or lies outside the accepted range for the asset
        type; True otherwise. The wav limits apply when 'wav' occurs in
        asset_type, the mpeg limits when 'mpeg' occurs in it.'''
        try:
            size = int(os.path.getsize(path))
        except Exception as e:
            # Deliberately broad: any failure to stat the file means the
            # asset is unusable, and it must not abort the whole run.
            self.append_db_errata(path, "Unable to determine file size.")
            self.log.error("Unable to determine file size: %s." % path)
            self.log.error("Exception %s." % str(e))
            return False

        if size == 0:
            self.append_db_errata(path, "File has zero size.")
            self.log.error("File has zero size: %s." % path)
            return False

        size_checks = (
            ('wav', self._WAV_SIZE_RANGE),
            ('mpeg', self._MPEG_SIZE_RANGE),
        )
        for keyword, (smallest, largest) in size_checks:
            if keyword in asset_type and (size > largest or size < smallest):
                self.append_db_errata(path, "Rejected file size is: %f KB" % (size / 1024.0))
                return False
        return True

    def get_different_asset_no(self):
        '''Return a count of the cases where the preferred asset type was not found'''
        return self.found_different_asset_type

    def reset_different_asset_no(self):
        '''Reset the asset type was not found counter'''
        self.found_different_asset_type = 0

    def append_db_errata(self, filename, reason, metadata=""):
        '''Append to a file collecting assets present in the DB but not found on disk.

        Writes one comma separated line: filename, reason and, if given,
        metadata. Returns False when no errata file is configured and None
        otherwise. Failures to write are logged, never raised.'''
        if not self.errata_file:
            return False

        if metadata:
            record = "%s,%s,%s\n" % (filename, reason, metadata)
        else:
            record = "%s,%s\n" % (filename, reason)

        try:
            with open(self.errata_file, "a") as ef:
                ef.write(record)
        except Exception:
            # Best effort only: errata bookkeeping must not break processing,
            # but unlike a bare except this lets KeyboardInterrupt/SystemExit
            # propagate.
            self.log.error("Failed to append database errata.", exc_info=True)