annotate pyspark/ilm/assetDB.py @ 0:e34cf1b6fe09 tip

commit
author Daniel Wolff
date Sat, 20 Feb 2016 18:14:24 +0100
parents
children
rev   line source
Daniel@0 1 # Part of DML (Digital Music Laboratory)
Daniel@0 2 #
Daniel@0 3 # This program is free software; you can redistribute it and/or
Daniel@0 4 # modify it under the terms of the GNU General Public License
Daniel@0 5 # as published by the Free Software Foundation; either version 2
Daniel@0 6 # of the License, or (at your option) any later version.
Daniel@0 7 #
Daniel@0 8 # This program is distributed in the hope that it will be useful,
Daniel@0 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
Daniel@0 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Daniel@0 11 # GNU General Public License for more details.
Daniel@0 12 #
Daniel@0 13 # You should have received a copy of the GNU General Public
Daniel@0 14 # License along with this library; if not, write to the Free Software
Daniel@0 15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Daniel@0 16
Daniel@0 17 #!/usr/bin/env python
Daniel@0 18 # encoding: utf-8
Daniel@0 19 """
Daniel@0 20 assetDB.py
Daniel@0 21
Daniel@0 22 Created by George Fazekas on 2012-01-16. Modifications by Mathieu Barthet in 2013-12,
Daniel@0 23 Steven Hargreaves 22/12/2014.
Daniel@0 24 Copyright (c) 2013 . All rights reserved.
Daniel@0 25 """
Daniel@0 26
Daniel@0 27 import sys,os,logging
Daniel@0 28 import sqlalchemy as sal
Daniel@0 29 from sqlalchemy.ext.declarative import declarative_base
Daniel@0 30 from sqlalchemy import Column, Integer, String, Sequence, Enum
Daniel@0 31 from sqlalchemy.orm import sessionmaker
Daniel@0 32 from sqlalchemy.dialects import mysql
Daniel@0 33 from hashlib import md5
Daniel@0 34
class assetDB(object):
    '''Access layer for the commercial asset database.

    Wraps an SQLAlchemy session on a MySQL database and turns the asset
    records it returns into validated audio file paths. Problems found
    on the way (missing files, implausible file sizes) are logged and,
    when an errata file is configured, appended to that file.

    Typical use: assetDB(prefix, config=config).connect() followed by
    iterating over get_assets() or get_assets_by_genre().
    '''

    asset_types = ['wav','mpeg/320kbps','mpeg/64kbps']
    extensions = ['wav','mp3','mp3']
    ext = dict(zip(asset_types,extensions))

    # Section of the configuration that holds the database settings.
    _CONFIG_SECTION = 'Commercial Asset Database'

    # (substring of the asset type, smallest size, largest size), in bytes.
    # Rationale for the lower bounds: with very small files some feature
    # extractor plugins fail or output junk. The mpeg bounds are the ones
    # the original code used under an assumed compression of about 1:10.
    _SIZE_LIMITS = (
        ('wav', 209715, 209715200),
        ('mpeg', 65536, 41943040),
    )

    def __init__(self, prefix, pref=None, config=None):
        '''Set up logging and bookkeeping; no database access happens here.

        prefix -- stored as self.prefix (not used by the methods of this class).
        pref   -- ordered list of asset types to fall back to when the
                  requested type is unavailable; when empty or omitted,
                  all known asset_types are used.
        config -- configuration object queried with get(section, option);
                  an optional db_errata_file attribute names the errata file.
        '''
        self.log = logging.getLogger('spark_feat_extract')
        self.log.info("ORM Version: %s",sal.__version__)
        self.config = config
        self.session = None
        self.Assets = None
        self.prefix = prefix
        if pref :
            self.asset_prefs = pref
        else :
            # copy, so that changes to the instance list never leak into the class
            self.asset_prefs = list(assetDB.asset_types)
        # reporting errors:
        self.found_different_asset_type = 0
        self.errata_file = None
        if config and hasattr(config,"db_errata_file") :
            self.errata_file = config.db_errata_file

    def connect(self,echo=False):
        '''Connect to the MySQL database and create a session.

        echo is passed on to sqlalchemy.create_engine. Returns self so
        that calls can be chained.
        '''
        section = self._CONFIG_SECTION
        user = self.config.get(section, 'user')
        passwd = self.config.get(section, 'passwd')
        host = self.config.get(section, 'host')
        name = self.config.get(section, 'name')
        url_template = "mysql://%s:%s@%s/%s"
        # Build the logged URL separately instead of replacing the password
        # inside the real one: a textual replace garbles the output when the
        # password is empty or also occurs elsewhere in the URL.
        self.log.info("Connecting to database server at: %s",url_template %(user,'*****',host,name))
        engine = sal.create_engine(url_template %(user,passwd,host,name), echo=echo)
        Session = sessionmaker(bind=engine)
        self.session = Session()
        self.log.debug("MySQL session created successfully.")
        return self

    def close(self):
        '''Close the database session (if there is one). Returns self.'''
        if self.session :
            self.session.close()
        self.log.info("Database closed.")
        return self

    def create_mapper(self):
        '''Create an Object-Relational Mapper class and store it in self.Assets.

        The table name is read from the configuration. Returns self.
        '''
        Base = declarative_base()
        tablename = self.config.get(self._CONFIG_SECTION, 'tablename')
        class Assets(Base):
            __tablename__ = tablename
            # map all table columns to variables here, e.g.
            # album_id = Column(Integer, primary_key=True)
            # song_title = Column(String)
            # genre_id = Column(Integer)
            # NOTE(review): the methods of assetDB read album_id, song_title
            # and genre_id, and SQLAlchemy presumably refuses to map a class
            # without a primary key column -- fill these in before use.
        self.Assets = Assets
        return self

    def _resolve_assets(self,assets,asset_type):
        '''Yield exactly one (path, asset) pair for each asset in assets.

        The path of the requested asset_type is used when it exists and has
        a plausible size; otherwise the types in self.asset_prefs are tried.
        The path is None when no usable file could be found.
        '''
        for asset in assets:
            path = self.generate_path(asset,asset_type)
            if self.validate_path(path) and self.validate_size(path,asset_type):
                yield path,asset
            elif not self.asset_prefs :
                self.log.error("Requested file for asset not found: %s. (Album ID: %i)",asset.song_title,asset.album_id)
                yield None,asset
            else :
                self.log.warning("Requested file for asset bad or not found: %s. (Album ID: %i)",asset.song_title,asset.album_id)
                self.log.warning("Trying other asset types.")
                # find_preferred_asset_path returns None when nothing fits
                yield self.find_preferred_asset_path(asset),asset

    def get_assets(self,start=0,limit=10,asset_type='audio/x-wav'):
        '''Returns some assets from the database as (path, asset) pairs.
        If the path given by the specified asset type does not exist,
        try to find the assets given the preference list provided in self.asset_prefs.
        If no valid path can be found for an asset, log the error and yield None for the path.

        NOTE: rows are selected with the slice [start:limit], i.e. limit is
        the exclusive end index of the slice and not a number of rows.'''
        # create the ORM mapper object if doesn't exist
        if self.Assets is None :
            self.create_mapper()

        query = self.session.query(self.Assets)
        for pair in self._resolve_assets(query[start:limit],asset_type):
            yield pair

    def get_assets_by_genre(self,genre_id,start=0,limit=10,asset_type='audio/x-wav'):
        '''Returns some assets of the given genre_id from the database as (path, asset) pairs.
        If the path given by the specified asset type does not exist,
        try to find the assets given the preference list provided in self.asset_prefs.
        If no valid path can be found for an asset, log the error and yield None for the path.

        NOTE: rows are selected with the slice [start:limit], i.e. limit is
        the exclusive end index of the slice and not a number of rows.'''
        # create the ORM mapper object if doesn't exist
        if self.Assets is None :
            self.create_mapper()

        # Slice the query itself (as get_assets does) rather than the list
        # returned by all(), so that not every matching row has to be loaded.
        query = self.session.query(self.Assets).filter(self.Assets.genre_id == genre_id)
        for pair in self._resolve_assets(query[start:limit],asset_type):
            yield pair

    def find_preferred_asset_path(self,asset):
        '''Iteratively generate a path name for each asset type in asset_prefs and return the first usable one.
        Return None if not found and log this event for error management.

        Every existing candidate increments the "different asset type"
        counter and is recorded in the errata file, also when it is
        rejected afterwards because of its size.'''
        for asset_type in self.asset_prefs :
            path = self.generate_path(asset,asset_type)
            if not self.validate_path(path):
                self.append_db_errata(path,"File not found.")
                continue
            self.log.info("Asset found but type is different from requested: %s. (Album ID: %i) ",asset.song_title,asset.album_id)
            self.append_db_errata(path,"Found different asset type for problem case. (%s)"%asset_type)
            self.found_different_asset_type += 1
            if self.validate_size(path,asset_type):
                return path
            self.log.error("Requested file for asset is worng size, probably corrupt: %s. (Album ID: %i)",asset.song_title,asset.album_id)
        # Reached only when no asset type gave a usable file; always report it.
        self.log.warning("Asset not found for: %s. (Album ID: %i)",asset.song_title,asset.album_id)
        return None

    def generate_path(self,asset,asset_type):
        '''Generate the path name given an asset database object and a requested asset type.

        Placeholder: currently always returns an empty string.'''
        path = '' # need to generate audio file path here
        return path

    def validate_path(self,path):
        '''Validate the generated path name: True if it names an existing file.'''
        # an empty or missing path can never be a file (and None would raise)
        if not path :
            return False
        return os.path.isfile(path)

    def validate_size(self,path,asset_type):
        '''Check if the file size makes sense for the given asset type.

        Returns False (after recording the reason in the errata file) when
        the size cannot be determined, is zero, or lies outside the bounds
        in _SIZE_LIMITS that apply to asset_type; True otherwise.'''
        try :
            size = int(os.path.getsize(path))
        except (OSError, TypeError, ValueError) as e:
            self.append_db_errata(path,"Unable to determine file size.")
            self.log.error("Unable to determine file size: %s.",path)
            self.log.error("Exception %s.",str(e))
            return False
        if size == 0 :
            self.append_db_errata(path,"File has zero size.")
            self.log.error("File has zero size: %s.",path)
            return False
        for marker,smallest,largest in self._SIZE_LIMITS :
            if marker in asset_type and (size > largest or size < smallest) :
                self.append_db_errata(path,"Rejected file size is: %f KB" %(size/1024.0))
                return False
        return True

    def get_different_asset_no(self):
        '''Return a count of the cases where the preferred asset type was not found'''
        return self.found_different_asset_type

    def reset_different_asset_no(self):
        '''Reset the asset type was not found counter'''
        self.found_different_asset_type = 0

    def append_db_errata(self,filename,reason,metadata=""):
        '''Append to a file collecting assets present in the DB but not found on disk.

        Writes one line "filename,reason" or "filename,reason,metadata".
        Returns True if the line was written and False if no errata file
        is configured or the write failed (the failure is logged; errata
        reporting is best effort and never raises for I/O problems).'''
        if not self.errata_file :
            return False
        fields = [filename,reason]
        if metadata :
            fields.append(metadata)
        try :
            line = ",".join(["%s" %field for field in fields]) + "\n"
            with open(self.errata_file,"a") as ef:
                ef.write(line)
        except (IOError, OSError, UnicodeError):
            self.log.error("Failed to append database errata.")
            return False
        return True
Daniel@0 229