Mercurial > hg > dml-open-backendtools
diff pyspark/ilm/assetDB.py @ 0:e34cf1b6fe09 tip
commit
author | Daniel Wolff |
---|---|
date | Sat, 20 Feb 2016 18:14:24 +0100 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python
# encoding: utf-8
#
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyspark/ilm/assetDB.py Sat Feb 20 18:14:24 2016 +0100 @@ -0,0 +1,229 @@
#
# Part of DML (Digital Music Laboratory)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

"""
assetDB.py

Created by George Fazekas on 2012-01-16. Modifications by Mathieu Barthet in 2013-12,
Steven Hargreaves 22/12/2014.
Copyright (c) 2013 . All rights reserved.
"""

import logging
import os
import sys
from hashlib import md5

import sqlalchemy as sal
from sqlalchemy import Column, Integer, String, Sequence, Enum
from sqlalchemy.dialects import mysql
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker


class assetDB(object):
    '''Look up audio assets in a MySQL database and validate their files on disk.

    The database connection parameters are read from the
    'Commercial Asset Database' section of the config object passed to the
    constructor. Problems found while validating files are optionally
    appended to an errata file (see append_db_errata).
    '''

    asset_types = ['wav', 'mpeg/320kbps', 'mpeg/64kbps']
    extensions = ['wav', 'mp3', 'mp3']
    # maps each asset type to its file extension
    ext = dict(zip(asset_types, extensions))

    # config section holding the database connection parameters
    _DB_SECTION = 'Commercial Asset Database'

    # accepted file size ranges in bytes (see validate_size)
    WAV_MAX_BYTES = 209715200
    WAV_MIN_BYTES = 209715
    MPEG_MAX_BYTES = 41943040
    MPEG_MIN_BYTES = 65536

    def __init__(self, prefix, pref=None, config=None):
        '''Set up logging, preferences and error reporting. No connection is made here.

        prefix -- stored as self.prefix (not used elsewhere in this class).
        pref   -- ordered list of asset types to fall back to; when empty or
                  None the class-level asset_types list is used.
        config -- object providing get(section, option); if it has a
                  db_errata_file attribute, that file collects errata.
        '''
        self.log = logging.getLogger('spark_feat_extract')
        self.log.info("ORM Version: %s", sal.__version__)
        self.config = config
        self.session = None
        self.Assets = None
        self.prefix = prefix
        self.asset_prefs = pref if pref else assetDB.asset_types
        # reporting errors:
        self.found_different_asset_type = 0
        self.errata_file = None
        if config and hasattr(config, "db_errata_file"):
            self.errata_file = config.db_errata_file

    def connect(self, echo=False):
        '''Connect to the MySQL database and create a session. Returns self.'''
        section = self._DB_SECTION
        user = self.config.get(section, 'user')
        passwd = self.config.get(section, 'passwd')
        host = self.config.get(section, 'host')
        name = self.config.get(section, 'name')
        url_template = "mysql://%s:%s@%s/%s"
        url = url_template % (user, passwd, host, name)
        # Build the logged URL separately instead of str.replace() on the real
        # one: replace() corrupts the output for an empty password and would
        # also mask user/host/name if they happen to contain the password.
        masked_url = url_template % (user, '*****', host, name)
        self.log.info("Connecting to database server at: %s", masked_url)
        engine = sal.create_engine(url, echo=echo)
        Session = sessionmaker(bind=engine)
        self.session = Session()
        self.log.debug("MySQL session created successfully.")
        return self

    def close(self):
        '''Close the database session if one is open. Returns self.'''
        if self.session:
            self.session.close()
            self.log.info("Database closed.")
        return self

    def create_mapper(self):
        '''Create an Object-Relational Mapper class and store it in self.Assets.

        The table name is read from the 'tablename' option of the database
        config section. Returns self.
        '''
        Base = declarative_base()
        table_name = self.config.get(self._DB_SECTION, 'tablename')

        class Assets(Base):
            __tablename__ = table_name
            # map all table columns to variables here, e.g.
            # album_id = Column(Integer, primary_key=True)
            # song_title = Column(String)
            # genre_id = Column(Integer)

        self.Assets = Assets
        return self

    def _yield_validated_assets(self, assets, asset_type):
        '''Yield exactly one (path, asset) pair per asset; path is None if no valid file exists.'''
        for asset in assets:
            path = self.generate_path(asset, asset_type)
            if self.validate_path(path) and self.validate_size(path, asset_type):
                yield path, asset
                continue
            if not self.asset_prefs:
                self.log.error("Requested file for asset not found: %s. (Album ID: %i)", asset.song_title, asset.album_id)
                yield None, asset
                continue
            self.log.warning("Requested file for asset bad or not found: %s. (Album ID: %i)", asset.song_title, asset.album_id)
            self.log.warning("Trying other asset types.")
            # find_preferred_asset_path returns None when nothing usable is found
            yield self.find_preferred_asset_path(asset), asset

    def get_assets(self, start=0, limit=10, asset_type='audio/x-wav'):
        '''Returns some assets from the database as (path, asset) pairs.

        If the path given by the specified asset type does not exist,
        try to find the asset given the preference list in self.asset_prefs.
        If no valid path can be found for an asset, log the error and yield
        None for the path.

        NOTE: the query is sliced as [start:limit], i.e. limit acts as the
        end index of the slice, not as a row count.
        '''
        # create the ORM mapper object if it doesn't exist
        if self.Assets is None:
            self.create_mapper()

        assets = self.session.query(self.Assets)[start:limit]
        for result in self._yield_validated_assets(assets, asset_type):
            yield result

    def get_assets_by_genre(self, genre_id, start=0, limit=10, asset_type='audio/x-wav'):
        '''Returns some assets of the given genre_id as (path, asset) pairs.

        If the path given by the specified asset type does not exist,
        try to find the asset given the preference list in self.asset_prefs.
        If no valid path can be found for an asset, log the error and yield
        None for the path.

        NOTE: the query is sliced as [start:limit], i.e. limit acts as the
        end index of the slice, not as a row count.
        '''
        # create the ORM mapper object if it doesn't exist
        if self.Assets is None:
            self.create_mapper()

        # Slice the query itself (as get_assets does) rather than .all()[...],
        # so that only the requested rows are fetched from the database.
        query = self.session.query(self.Assets).filter(self.Assets.genre_id == genre_id)
        assets = query[start:limit]
        for result in self._yield_validated_assets(assets, asset_type):
            yield result

    def find_preferred_asset_path(self, asset):
        '''Try each asset type in self.asset_prefs in order and return the first usable path.

        A path is usable if the file exists and passes validate_size. Every
        existing file found here increments the different-asset-type counter
        and is recorded in the errata file, even if it is then rejected for
        its size. Returns None, and logs a warning, if nothing usable is found.
        '''
        for asset_type in self.asset_prefs:
            path = self.generate_path(asset, asset_type)
            if not self.validate_path(path):
                self.append_db_errata(path, "File not found.")
                continue
            self.log.info("Asset found but type is different from requested: %s. (Album ID: %i) ", asset.song_title, asset.album_id)
            self.append_db_errata(path, "Found different asset type for problem case. (%s)" % asset_type)
            self.found_different_asset_type += 1
            if self.validate_size(path, asset_type):
                return path
            self.log.error("Requested file for asset is worng size, probably corrupt: %s. (Album ID: %i)", asset.song_title, asset.album_id)
        self.log.warning("Asset not found for: %s. (Album ID: %i)", asset.song_title, asset.album_id)
        return None

    def generate_path(self, asset, asset_type):
        '''Generate the path name given an asset database object and a requested asset type.

        Placeholder: currently always returns an empty string.
        '''
        path = ''  # need to generate audio file path here
        return path

    def validate_path(self, path):
        '''Return True if the generated path names an existing regular file.'''
        return os.path.isfile(path)

    def validate_size(self, path, asset_type):
        '''Check if the file size makes sense for the given asset type.

        Returns False (and records the reason in the errata file) if the size
        cannot be determined, is zero, or falls outside the accepted range
        for 'wav' or 'mpeg' asset types; returns True otherwise.
        '''
        try:
            size = int(os.path.getsize(path))
        except Exception as e:
            self.append_db_errata(path, "Unable to determine file size.")
            self.log.error("Unable to determine file size: %s." % path)
            self.log.error("Exception %s." % str(e))
            return False
        if size == 0:
            self.append_db_errata(path, "File has zero size.")
            self.log.error("File has zero size: %s." % path)
            return False
        if 'wav' in asset_type:
            # rationale: with very small files some feature extractor plugins fail or output junk
            if size > self.WAV_MAX_BYTES or size < self.WAV_MIN_BYTES:
                self.append_db_errata(path, "Rejected file size is: %f KB" % (size / 1024.0))
                return False
        if 'mpeg' in asset_type:
            # same assuming about 1:10 compression
            if size > self.MPEG_MAX_BYTES or size < self.MPEG_MIN_BYTES:
                self.append_db_errata(path, "Rejected file size is: %f KB" % (size / 1024.0))
                return False
        return True

    def get_different_asset_no(self):
        '''Return the number of existing files found by find_preferred_asset_path since the last reset.'''
        return self.found_different_asset_type

    def reset_different_asset_no(self):
        '''Reset the counter returned by get_different_asset_no to zero.'''
        self.found_different_asset_type = 0

    def append_db_errata(self, filename, reason, metadata=""):
        '''Append a "filename,reason[,metadata]" line to the errata file.

        Best effort: a failure to write is logged and not raised.
        Returns False if no errata file is configured, otherwise None.
        '''
        if not self.errata_file:
            return False
        if metadata:
            line = "%s,%s,%s\n" % (filename, reason, metadata)
        else:
            line = "%s,%s\n" % (filename, reason)
        try:
            with open(self.errata_file, "a+") as ef:
                ef.write(line)
        except Exception:
            # Exception rather than a bare except, so that KeyboardInterrupt
            # and SystemExit are not swallowed.
            self.log.error("Failed to append database errata.", exc_info=True)