diff pyspark/ilm/assetDB.py @ 0:e34cf1b6fe09 tip

commit
author Daniel Wolff
date Sat, 20 Feb 2016 18:14:24 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyspark/ilm/assetDB.py	Sat Feb 20 18:14:24 2016 +0100
@@ -0,0 +1,229 @@
+# Part of DML (Digital Music Laboratory)
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+#!/usr/bin/env python
+# encoding: utf-8
+"""
+assetDB.py
+
+Created by George Fazekas on 2012-01-16. Modifications by Mathieu Barthet in 2013-12,
+Steven Hargreaves 22/12/2014.
+Copyright (c) 2013 . All rights reserved.
+"""
+
+import sys,os,logging
+import sqlalchemy as sal
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy import Column, Integer, String, Sequence, Enum
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy.dialects import mysql
+from hashlib import md5
+
+class assetDB(object):
+    '''Access layer for the asset database.
+
+    Opens an SQLAlchemy session on the MySQL database described in the
+    'Commercial Asset Database' section of the supplied configuration, and
+    yields (path, asset) pairs for the rows of the configured table. Each
+    path is checked on disk (existence and plausible size); problems are
+    logged and, when an errata file is configured, appended to that file.
+
+    NOTE: generate_path() and the column mapping in create_mapper() are
+    placeholders in this file and have to be completed before use.
+    '''
+
+    asset_types = ['wav','mpeg/320kbps','mpeg/64kbps']
+    extensions = ['wav','mp3','mp3']
+    ext = dict(zip(asset_types,extensions))
+
+    # name of the configuration section holding the database settings
+    _db_section = 'Commercial Asset Database'
+
+    # accepted file sizes in bytes: (substring of asset type, minimum, maximum)
+    # wav : 209715 B (about 205 KiB) .. 209715200 B (200 MiB);
+    #       rationale: with very small files some feature extractor plugins
+    #       fail or output junk
+    # mpeg: 65536 B (64 KiB) .. 41943040 B (40 MiB);
+    #       same assuming about 1:10 compression
+    _size_limits = (
+        ('wav',209715,209715200),
+        ('mpeg',65536,41943040),
+    )
+
+    def __init__(self, prefix, pref=None, config=None):
+        '''Set up logging, configuration and the error counters.
+
+        prefix -- stored as self.prefix (not used by the methods in this class)
+        pref   -- optional list of asset types, in order of preference, tried
+                  when the requested asset type is unusable; defaults to
+                  assetDB.asset_types when empty or None
+        config -- configuration object; connect() and create_mapper() read
+                  the 'Commercial Asset Database' section through
+                  config.get(section, option), and an optional attribute
+                  db_errata_file names the errata file
+        '''
+        self.log = logging.getLogger('spark_feat_extract')
+        self.log.info("ORM Version: %s",sal.__version__)
+        self.config = config
+        self.session = None
+        self.Assets = None
+        self.prefix = prefix
+        # always keep a private copy: otherwise changing the preferences of
+        # one instance would change the caller's list or the class default
+        if pref :
+            self.asset_prefs = list(pref)
+        else :
+            self.asset_prefs = list(assetDB.asset_types)
+        # reporting errors:
+        self.found_different_asset_type = 0
+        self.errata_file = None
+        if config and hasattr(config,"db_errata_file") :
+            self.errata_file = config.db_errata_file
+
+
+    def connect(self,echo=False):
+        '''Connect to the MySQL database and create a session.
+
+        echo is passed on to sqlalchemy.create_engine().
+        Returns self so that calls can be chained.'''
+        section = assetDB._db_section
+        user = self.config.get(section, 'user')
+        passwd = self.config.get(section, 'passwd')
+        host = self.config.get(section, 'host')
+        name = self.config.get(section, 'name')
+        URL = "mysql://%s:%s@%s/%s" %(user,passwd,host,name)
+        # build the logged URL separately instead of replacing the password
+        # inside the real one: str.replace() garbles the output when the
+        # password is empty or also occurs in the user or host name
+        self.log.info("Connecting to database server at: %s","mysql://%s:%s@%s/%s" %(user,'*****',host,name))
+        engine=sal.create_engine(URL, echo=echo)
+        Session = sessionmaker(bind=engine)
+        self.session = Session()
+        self.log.debug("MySQL session created successfully.")
+        return self
+
+    def close(self):
+        '''Close the database session if one is open. Returns self.'''
+        if self.session :
+            self.session.close()
+            self.log.info("Database closed.")
+        return self
+
+    def create_mapper(self):
+        '''Create an Object-Relational Mapper class and store it in self.Assets.
+
+        The table name is read from the 'tablename' option of the database
+        configuration section. Returns self.'''
+        Base = declarative_base()
+        class Assets(Base):
+            #change
+            #__tablename__ = 'assets'
+            __tablename__ = self.config.get('Commercial Asset Database', 'tablename')
+            # map all table columns to variables here, e.g.
+            # album_id = Column(Integer, primary_key=True)
+            # song_title = Column(String)
+            # genre_id = Column(Integer)
+            # NOTE(review): no column is mapped as written, although the
+            # methods of this class read album_id, song_title and genre_id;
+            # SQLAlchemy presumably also needs a primary key column to map
+            # the class - TODO confirm and fill in the columns
+        self.Assets = Assets
+        return self
+
+    def _resolve_assets(self,assets,asset_type):
+        '''Yield one (path, asset) pair for every asset in assets.
+
+        The path is the one generated for asset_type when that file exists
+        and has a plausible size; otherwise the first usable path from
+        self.asset_prefs, or None when no usable file is found.'''
+        for asset in assets:
+            path = self.generate_path(asset,asset_type)
+            if self.validate_path(path) and self.validate_size(path,asset_type):
+                yield path,asset
+            elif not self.asset_prefs :
+                self.log.error("Requested file for asset not found: %s. (Album ID: %i)",asset.song_title,asset.album_id)
+                yield None,asset
+            else :
+                #change
+                self.log.warning("Requested file for asset bad or not found: %s. (Album ID: %i)",asset.song_title,asset.album_id)
+                self.log.warning("Trying other asset types.")
+                # find_preferred_asset_path() returns None when nothing is
+                # usable, so each asset still yields exactly once
+                yield self.find_preferred_asset_path(asset),asset
+
+    def get_assets(self,start=0,limit=10,asset_type='audio/x-wav'):
+        '''Returns some assets from the database as (path, asset) pairs.
+        If the path given by the specified asset type does not exist,
+        try to find the assets given the preference list provided in self.asset_prefs.
+        If no valid path can be found for an asset, log the error and yield None for the path.
+
+        The query result is sliced as [start:limit], i.e. limit is the end
+        index of the slice and not a number of rows.'''
+        # limit = start + limit # this changes the semantics of the SQL limit
+
+        # create the ORM mapper object if doesn't exist
+        if self.Assets is None :
+            self.create_mapper()
+
+        # generate an SQL query and for each asset in the results, yield a (validated) path name for the asset, or yield None if not found
+        assets = self.session.query(self.Assets)[start:limit]
+        for result in self._resolve_assets(assets,asset_type):
+            yield result
+
+    def get_assets_by_genre(self,genre_id,start=0,limit=10,asset_type='audio/x-wav'):
+        '''Returns some assets of the given genre_id from the database as (path, asset) pairs.
+        If the path given by the specified asset type does not exist,
+        try to find the assets given the preference list provided in self.asset_prefs.
+        If no valid path can be found for an asset, log the error and yield None for the path.
+
+        The query result is sliced as [start:limit], i.e. limit is the end
+        index of the slice and not a number of rows.'''
+        # limit = start + limit # this changes the semantics of the SQL limit
+
+        # create the ORM mapper object if doesn't exist
+        if self.Assets is None :
+            self.create_mapper()
+
+        # slice the query itself (as get_assets does) instead of calling
+        # .all() first, so that not every row of the genre has to be loaded
+        assets = self.session.query(self.Assets).filter(self.Assets.genre_id == genre_id)[start:limit]
+        for result in self._resolve_assets(assets,asset_type):
+            yield result
+
+    def find_preferred_asset_path(self,asset):
+        '''Iteratively find a path name for each asset type in asset_prefs and return the first one available.
+        Return None if not found and log this event for error management.
+
+        A candidate is only returned when the file exists and passes
+        validate_size(). Every existing candidate increments
+        self.found_different_asset_type, even if its size is then rejected.'''
+        for asset_type in self.asset_prefs :
+            path = self.generate_path(asset,asset_type)
+            if not self.validate_path(path):
+                self.append_db_errata(path,"File not found.")
+                continue
+            self.log.info("Asset found but type is different from requested: %s. (Album ID: %i) ",asset.song_title,asset.album_id)
+            self.append_db_errata(path,"Found different asset type for problem case. (%s)"%asset_type)
+            self.found_different_asset_type += 1
+            if self.validate_size(path,asset_type):
+                return path
+            self.log.error("Requested file for asset is worng size, probably corrupt: %s. (Album ID: %i)",asset.song_title,asset.album_id)
+        # reached only when no candidate was usable; always report it, no
+        # matter what the last generated path looked like
+        self.log.warning("Asset not found for: %s. (Album ID: %i)",asset.song_title,asset.album_id)
+        return None
+
+    def generate_path(self,asset,asset_type):
+        '''Generate the path name given a asset database object and a requested asset type.
+
+        Placeholder: currently always returns the empty string.'''
+        path = '' # need to generate audio file path here
+        return path
+
+    def validate_path(self,path):
+        '''Validate the generated path name: True if it is an existing regular file.'''
+        return os.path.isfile(path)
+
+    def validate_size(self,path,asset_type):
+        '''Check if the file size makes sense.
+
+        Returns False when the size cannot be determined, is zero, or lies
+        outside the limits in assetDB._size_limits for the asset type
+        (matched by substring); True otherwise. Rejections are appended to
+        the errata file.'''
+        try :
+            size = int(os.path.getsize(path))
+        except Exception as e:
+            self.append_db_errata(path,"Unable to determine file size.")
+            self.log.error("Unable to determine file size: %s.",path)
+            self.log.error("Exception %s.",e)
+            return False
+        if size == 0 :
+            self.append_db_errata(path,"File has zero size.")
+            self.log.error("File has zero size: %s.",path)
+            return False
+        for type_key,min_size,max_size in assetDB._size_limits :
+            if type_key in asset_type and (size > max_size or size < min_size) :
+                self.append_db_errata(path,"Rejected file size is: %f KB" %(size/1024.0))
+                return False
+        return True
+
+    def get_different_asset_no(self):
+        '''Return a count of the cases where the preferred asset type was not found'''
+        return self.found_different_asset_type
+
+    def reset_different_asset_no(self):
+        '''Reset the asset type was not found counter'''
+        self.found_different_asset_type = 0
+
+    def append_db_errata(self,filename,reason,metadata=""):
+        '''Append to a file collecting assets present in the DB but not found on disk.
+
+        Writes one line "filename,reason" or "filename,reason,metadata".
+        Returns True if the line was written, False if no errata file is
+        configured or writing failed (failures are logged, never raised).'''
+        if not self.errata_file :
+            return False
+        try :
+            if metadata :
+                line = "%s,%s,%s\n" %(filename,reason,metadata)
+            else :
+                line = "%s,%s\n" %(filename,reason)
+            with open(self.errata_file,"a") as ef:
+                ef.write(line)
+        except Exception as e:
+            # best effort: errata problems must not interrupt processing,
+            # but KeyboardInterrupt/SystemExit are no longer swallowed
+            self.log.error("Failed to append database errata.")
+            self.log.error("Exception %s.",e)
+            return False
+        return True
+