view pyspark/ilm/assetDB.py @ 0:e34cf1b6fe09 tip

commit
author Daniel Wolff
date Sat, 20 Feb 2016 18:14:24 +0100
parents
children
line wrap: on
line source
# Part of DML (Digital Music Laboratory)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

#!/usr/bin/env python
# encoding: utf-8
"""
assetDB.py

Created by George Fazekas on 2012-01-16. Modified by Mathieu Barthet (2013-12)
and Steven Hargreaves (2014-12-22).
Copyright (c) 2013 . All rights reserved.
"""

import sys,os,logging
import sqlalchemy as sal
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Sequence, Enum
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects import mysql
from hashlib import md5

class assetDB(object):
    '''Look up audio assets in the asset database and resolve them to validated file paths.

    The database connection parameters and the table name are read from the
    'Commercial Asset Database' section of the config object
    (options: user, passwd, host, name, tablename).

    Typical use: assetDB(prefix, config=config).connect() and then iterate over
    get_assets() or get_assets_by_genre(), which yield (path, asset) tuples.
    '''

    asset_types = ['wav','mpeg/320kbps','mpeg/64kbps']
    extensions = ['wav','mp3','mp3']
    ext = dict(zip(asset_types,extensions))

    # config section holding the database connection parameters
    config_section = 'Commercial Asset Database'

    # accepted (minimum, maximum) file sizes in bytes, both bounds inclusive
    # rationale: with very small files some feature extractor plugins fail or output junk
    wav_size_limits = (209715, 209715200)
    # same assuming about 1:10 compression
    mpeg_size_limits = (65536, 41943040)

    def __init__(self, prefix, pref=None, config=None):
        '''Set up logging, asset type preferences and error reporting. Does not connect yet.

        prefix -- stored as self.prefix (not used by the methods of this class itself)
        pref   -- ordered list of asset types to fall back to when the requested type
                  is unusable; defaults to a copy of assetDB.asset_types when empty/None
        config -- object providing get(section, option); may carry an optional
                  db_errata_file attribute naming the errata file to append to
        '''
        self.log = logging.getLogger('spark_feat_extract')
        self.log.info("ORM Version: %s",sal.__version__)
        self.config = config
        self.session = None
        self.Assets = None
        self.prefix = prefix
        if pref :
            self.asset_prefs = pref
        else :
            # copy, so that changing the preferences of one instance never alters the class-level list
            self.asset_prefs = list(assetDB.asset_types)
        # reporting errors:
        self.found_different_asset_type = 0
        self.errata_file = None
        if config and hasattr(config,"db_errata_file") :
            self.errata_file = config.db_errata_file

    def _database_url(self, mask_password=False):
        '''Build the MySQL connection URL from the config.
        With mask_password=True the password is replaced by asterisks (for logging).'''
        get = self.config.get
        section = self.config_section
        passwd = '*****' if mask_password else get(section, 'passwd')
        return "mysql://%s:%s@%s/%s" %(get(section, 'user'), passwd, get(section, 'host'), get(section, 'name'))

    def connect(self,echo=False):
        '''Connect to the MySQL database and create a session. Returns self.'''
        # The masked URL is built separately rather than by string replacement on the real URL:
        # replacing an empty password would scatter the mask over the whole URL.
        self.log.info("Connecting to database server at: %s",self._database_url(mask_password=True))
        engine = sal.create_engine(self._database_url(), echo=echo)
        Session = sessionmaker(bind=engine)
        self.session = Session()
        self.log.debug("MySQL session created successfully.")
        return self

    def close(self):
        '''Close the database session (if one was created). Returns self.'''
        if self.session :
            self.session.close()
            self.log.info("Database closed.")
        return self

    def create_mapper(self):
        '''Create an Object-Relational Mapper class for the configured table and
        store it in self.Assets. Returns self.'''
        Base = declarative_base()
        tablename = self.config.get(self.config_section, 'tablename')
        class Assets(Base):
            __tablename__ = tablename
            # map all table columns to variables here, e.g.
            # album_id = Column(Integer, primary_key=True)
            # song_title = Column(String)
            # genre_id = Column(Integer)
            # NOTE(review): the other methods read song_title, album_id and genre_id from
            # the mapped objects, and SQLAlchemy is expected to reject a mapped class
            # without a primary key column - fill in the columns before use.
        self.Assets = Assets
        return self

    def get_assets(self,start=0,limit=10,asset_type='audio/x-wav'):
        '''Returns some assets from the database as a generator of (path, asset) tuples.
        If the path given by the specified asset type does not exist,
        try to find the assets given the preference list provided in self.asset_prefs.
        If no valid path can be found for an asset, log the error and yield None for the path.

        The query result is sliced as [start:limit], i.e. limit is the (exclusive)
        end index and not a row count.'''
        # create the ORM mapper object if doesn't exist
        if self.Assets is None :
            self.create_mapper()
        assets = self.session.query(self.Assets)[start:limit]
        for path_and_asset in self._resolve_assets(assets,asset_type):
            yield path_and_asset

    def get_assets_by_genre(self,genre_id,start=0,limit=10,asset_type='audio/x-wav'):
        '''Returns some assets of the given genre_id from the database as a generator
        of (path, asset) tuples.
        If the path given by the specified asset type does not exist,
        try to find the assets given the preference list provided in self.asset_prefs.
        If no valid path can be found for an asset, log the error and yield None for the path.

        The query result is sliced as [start:limit], i.e. limit is the (exclusive)
        end index and not a row count.'''
        # create the ORM mapper object if doesn't exist
        if self.Assets is None :
            self.create_mapper()
        # slice the query itself (as get_assets does) instead of loading all rows of the genre first
        query = self.session.query(self.Assets).filter(self.Assets.genre_id == genre_id)
        for path_and_asset in self._resolve_assets(query[start:limit],asset_type):
            yield path_and_asset

    def _resolve_assets(self,assets,asset_type):
        '''Yield exactly one (path, asset) tuple per asset; path is None if no usable file exists.'''
        for asset in assets:
            yield self._resolve_asset_path(asset,asset_type),asset

    def _resolve_asset_path(self,asset,asset_type):
        '''Return a validated path for the asset: the requested asset type if usable,
        otherwise the first usable type from self.asset_prefs, otherwise None.'''
        path = self.generate_path(asset,asset_type)
        if self.validate_path(path) and self.validate_size(path,asset_type):
            return path
        if not self.asset_prefs :
            # nothing to fall back to
            self.log.error("Requested file for asset not found: %s. (Album ID: %i)",asset.song_title,asset.album_id)
            return None
        self.log.warning("Requested file for asset bad or not found: %s. (Album ID: %i)",asset.song_title,asset.album_id)
        self.log.warning("Trying other asset types.")
        return self.find_preferred_asset_path(asset)

    def find_preferred_asset_path(self,asset):
        '''Iteratively find a path name for each asset type in asset_prefs and return the first
        one that exists and has a plausible size.
        Return None if not found and log this event for error management.

        Every existing file found here is recorded in the errata file and counted in
        self.found_different_asset_type, even when its size is rejected afterwards.'''
        for asset_type in self.asset_prefs :
            path = self.generate_path(asset,asset_type)
            if not self.validate_path(path):
                self.append_db_errata(path,"File not found.")
                continue
            self.log.info("Asset found but type is different from requested: %s. (Album ID: %i) ",asset.song_title,asset.album_id)
            self.append_db_errata(path,"Found different asset type for problem case. (%s)"%asset_type)
            self.found_different_asset_type += 1
            if self.validate_size(path,asset_type):
                return path
            self.log.error("Requested file for asset is wrong size, probably corrupt: %s. (Album ID: %i)",asset.song_title,asset.album_id)
        # all preferences exhausted without a usable file
        self.log.warning("Asset not found for: %s. (Album ID: %i)",asset.song_title,asset.album_id)
        return None

    def generate_path(self,asset,asset_type):
        '''Generate the path name given a asset database object and a requested asset type.
        Placeholder: currently always returns an empty string.'''
        path = '' # need to generate audio file path here
        return path

    def validate_path(self,path):
        '''Validate the generated path name: True if it names an existing regular file.'''
        return os.path.isfile(path)

    def validate_size(self,path,asset_type):
        '''Check if the file size makes sense for the given asset type.

        Returns False (and records the reason in the errata file) if the size cannot be
        determined, is zero, or lies outside wav_size_limits / mpeg_size_limits for asset
        types containing 'wav' / 'mpeg'. Returns True otherwise.'''
        try :
            size = int(os.path.getsize(path))
        except (OSError, TypeError, ValueError) as e:
            # missing/unreadable file, or a path that is not a usable string
            self.append_db_errata(path,"Unable to determine file size.")
            self.log.error("Unable to determine file size: %s.",path)
            self.log.error("Exception %s.",e)
            return False
        if size == 0 :
            self.append_db_errata(path,"File has zero size.")
            self.log.error("File has zero size: %s.",path)
            return False
        size_rules = (('wav',self.wav_size_limits),('mpeg',self.mpeg_size_limits))
        for marker,(smallest,largest) in size_rules :
            if marker in asset_type and not smallest <= size <= largest :
                self.append_db_errata(path,"Rejected file size is: %f KB" %(size/1024.0))
                return False
        return True

    def get_different_asset_no(self):
        '''Return a count of the cases where the preferred asset type was not found'''
        return self.found_different_asset_type

    def reset_different_asset_no(self):
        '''Reset the asset type was not found counter'''
        self.found_different_asset_type = 0

    def append_db_errata(self,filename,reason,metadata=""):
        '''Append to a file collecting assets present in the DB but not found on disk.

        Writes one line "filename,reason[,metadata]" to self.errata_file.
        Best effort: failures are logged, never raised.
        Returns True if the line was written, False if no errata file is configured
        or writing failed.'''
        if not self.errata_file :
            return False
        try :
            if metadata :
                line = "%s,%s,%s\n" %(filename,reason,metadata)
            else :
                line = "%s,%s\n" %(filename,reason)
            with open(self.errata_file,"a") as ef:
                ef.write(line)
        except Exception as e:
            # deliberately broad (I/O and encoding problems alike): error reporting must not
            # abort the processing run, but interrupts and system exit are no longer swallowed
            self.log.error("Failed to append database errata.")
            self.log.error("Exception %s.",e)
            return False
        return True