comparison pyspark/ilm/assetDB.py @ 0:e34cf1b6fe09 tip

commit
author Daniel Wolff
date Sat, 20 Feb 2016 18:14:24 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e34cf1b6fe09
1 # Part of DML (Digital Music Laboratory)
2 #
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
7 #
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
12 #
13 # You should have received a copy of the GNU General Public
14 # License along with this library; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
17 #!/usr/bin/env python
18 # encoding: utf-8
19 """
20 assetDB.py
21
22 Created by George Fazekas on 2012-01-16. Modifications by Mathieu Barthet in 2013-12,
23 Steven Hargreaves 22/12/2014.
24 Copyright (c) 2013 . All rights reserved.
25 """
26
27 import sys,os,logging
28 import sqlalchemy as sal
29 from sqlalchemy.ext.declarative import declarative_base
30 from sqlalchemy import Column, Integer, String, Sequence, Enum
31 from sqlalchemy.orm import sessionmaker
32 from sqlalchemy.dialects import mysql
33 from hashlib import md5
34
class assetDB(object):
    """Access layer for the commercial asset database.

    Wraps an SQLAlchemy session, maps the asset table, and yields
    ``(path, asset)`` pairs where ``path`` has been checked to exist on
    disk and to have a plausible file size (or is ``None`` when no usable
    file could be found).
    """

    asset_types = ['wav', 'mpeg/320kbps', 'mpeg/64kbps']
    extensions = ['wav', 'mp3', 'mp3']
    ext = dict(zip(asset_types, extensions))

    # Name of the configuration section holding the database settings.
    _DB_SECTION = 'Commercial Asset Database'

    # Accepted file-size windows in bytes.
    # rationale: with very small files some feature extractor plugins fail
    # or output junk
    WAV_MAX_BYTES = 209715200   # 200 * 1024 * 1024
    WAV_MIN_BYTES = 209715      # roughly 0.2 * 1024 * 1024
    # same, assuming about 1:10 compression
    MPEG_MAX_BYTES = 41943040   # 40 * 1024 * 1024
    MPEG_MIN_BYTES = 65536      # 64 * 1024

    def __init__(self, prefix, pref=None, config=None):
        """Set up logging, preferences and error reporting.

        prefix -- stored on the instance as ``self.prefix``.
        pref   -- optional ordered iterable of asset types to fall back
                  to; when empty or omitted, ``assetDB.asset_types`` is used.
        config -- configuration object; ``connect`` and ``create_mapper``
                  read the 'Commercial Asset Database' section from it. If
                  it has a ``db_errata_file`` attribute, problem cases are
                  appended to that file.
        """
        self.log = logging.getLogger('spark_feat_extract')
        self.log.info("ORM Version: %s", sal.__version__)
        self.config = config
        self.session = None
        self.Assets = None
        self.prefix = prefix
        # Copy so that changing an instance's preferences can never alter
        # the caller's list or the class-level default.
        self.asset_prefs = list(pref) if pref else list(assetDB.asset_types)
        # reporting errors:
        self.found_different_asset_type = 0
        self.errata_file = getattr(config, "db_errata_file", None) if config else None

    def connect(self, echo=False):
        """Connect to the MySQL database and create a session.

        Returns ``self`` so that calls can be chained.
        """
        section = self._DB_SECTION
        user = self.config.get(section, 'user')
        passwd = self.config.get(section, 'passwd')
        host = self.config.get(section, 'host')
        name = self.config.get(section, 'name')
        template = "mysql://%s:%s@%s/%s"
        # Build the loggable URL separately instead of string-replacing the
        # password: replace() misbehaves for an empty password and would also
        # mask a user or host name that happens to contain the same text.
        self.log.info("Connecting to database server at: %s",
                      template % (user, '*****', host, name))
        engine = sal.create_engine(template % (user, passwd, host, name), echo=echo)
        Session = sessionmaker(bind=engine)
        self.session = Session()
        self.log.debug("MySQL session created successfully.")
        return self

    def close(self):
        """Close the database session (if one is open) and return ``self``."""
        if self.session:
            self.session.close()
            self.log.info("Database closed.")
        return self

    def create_mapper(self):
        """Create the Object-Relational Mapper class and store it in ``self.Assets``.

        The table name is read from the configuration. Returns ``self``.
        """
        Base = declarative_base()
        table_name = self.config.get(self._DB_SECTION, 'tablename')

        class Assets(Base):
            __tablename__ = table_name
            # map all table columns to variables here, e.g.
            # album_id = Column(Integer, primary_key=True)
            # song_title = Column(String)
            # genre_id = Column(Integer)
            # NOTE(review): the rest of this class reads asset.album_id,
            # asset.song_title and Assets.genre_id, so at least those columns
            # have to be mapped; SQLAlchemy also presumably needs a primary
            # key column before this class can be defined - confirm.

        self.Assets = Assets
        return self

    def _resolve_asset_paths(self, assets, asset_type):
        """Yield exactly one ``(path, asset)`` pair per asset in ``assets``.

        The path for the requested ``asset_type`` is used when it exists and
        has a sensible size. Otherwise the types in ``self.asset_prefs`` are
        tried; ``None`` is yielded as the path when nothing usable is found.
        """
        for asset in assets:
            path = self.generate_path(asset, asset_type)
            if self.validate_path(path) and self.validate_size(path, asset_type):
                yield path, asset
            elif not self.asset_prefs:
                self.log.error("Requested file for asset not found: %s. (Album ID: %i)",
                               asset.song_title, asset.album_id)
                yield None, asset
            else:
                self.log.warning("Requested file for asset bad or not found: %s. (Album ID: %i)",
                                 asset.song_title, asset.album_id)
                self.log.warning("Trying other asset types.")
                # find_preferred_asset_path returns None when nothing fits
                yield self.find_preferred_asset_path(asset), asset

    def get_assets(self, start=0, limit=10, asset_type='audio/x-wav'):
        """Return some assets from the database as ``(path, asset)`` pairs.

        If the path given by the specified asset type does not exist,
        try to find the asset using the preference list in self.asset_prefs.
        If no valid path can be found for an asset, log the error and yield
        None for the path.

        Note that ``start`` and ``limit`` are applied as the slice
        ``[start:limit]``, i.e. ``limit`` is a stop index, not a row count.
        """
        # create the ORM mapper object if it doesn't exist
        if self.Assets is None:
            self.create_mapper()

        rows = self.session.query(self.Assets)[start:limit]
        for pair in self._resolve_asset_paths(rows, asset_type):
            yield pair

    def get_assets_by_genre(self, genre_id, start=0, limit=10, asset_type='audio/x-wav'):
        """Return some assets of the given genre_id as ``(path, asset)`` pairs.

        If the path given by the specified asset type does not exist,
        try to find the asset using the preference list in self.asset_prefs.
        If no valid path can be found for an asset, log the error and yield
        None for the path.

        Note that ``start`` and ``limit`` are applied as the slice
        ``[start:limit]``, i.e. ``limit`` is a stop index, not a row count.
        """
        # create the ORM mapper object if it doesn't exist
        if self.Assets is None:
            self.create_mapper()

        # Slice the query itself (as get_assets does) rather than fetching
        # every matching row with .all() and slicing the list in memory.
        query = self.session.query(self.Assets).filter(self.Assets.genre_id == genre_id)
        for pair in self._resolve_asset_paths(query[start:limit], asset_type):
            yield pair

    def find_preferred_asset_path(self, asset):
        """Return the first usable path among the types in ``self.asset_prefs``.

        For each preferred type a path is generated; the first one that exists
        and passes the size check is returned. Every miss is recorded in the
        errata file. Returns None (and logs a warning) if nothing is usable.
        """
        for asset_type in self.asset_prefs:
            path = self.generate_path(asset, asset_type)
            if not self.validate_path(path):
                self.append_db_errata(path, "File not found.")
                continue
            self.log.info("Asset found but type is different from requested: %s. (Album ID: %i) ",
                          asset.song_title, asset.album_id)
            self.append_db_errata(path, "Found different asset type for problem case. (%s)" % asset_type)
            self.found_different_asset_type += 1
            if self.validate_size(path, asset_type):
                return path
            self.log.error("Requested file for asset is worng size, probably corrupt: %s. (Album ID: %i)",
                           asset.song_title, asset.album_id)
        # Reaching this point always means failure, so always report it (the
        # old check on the length of the last generated path would go silent
        # as soon as generate_path returned real path names).
        self.log.warning("Asset not found for: %s. (Album ID: %i)", asset.song_title, asset.album_id)
        return None

    def generate_path(self, asset, asset_type):
        """Generate the path name given an asset database object and a requested asset type.

        Placeholder: currently always returns an empty string.
        """
        path = ''  # need to generate audio file path here
        return path

    def validate_path(self, path):
        """Return True if ``path`` names an existing regular file."""
        return os.path.isfile(path)

    def validate_size(self, path, asset_type):
        """Check if the file size makes sense for the given asset type.

        Returns False (recording the reason in the errata file) when the size
        cannot be read, is zero, or lies outside the window accepted for
        'wav' or 'mpeg' asset types; returns True otherwise.
        """
        try:
            size = int(os.path.getsize(path))
        except (OSError, TypeError, ValueError) as e:
            self.append_db_errata(path, "Unable to determine file size.")
            self.log.error("Unable to determine file size: %s.", path)
            self.log.error("Exception %s.", str(e))
            return False
        if size == 0:
            self.append_db_errata(path, "File has zero size.")
            self.log.error("File has zero size: %s.", path)
            return False
        if 'wav' in asset_type:
            if size > self.WAV_MAX_BYTES or size < self.WAV_MIN_BYTES:
                self.append_db_errata(path, "Rejected file size is: %f KB" % (size / 1024.0))
                return False
        if 'mpeg' in asset_type:
            if size > self.MPEG_MAX_BYTES or size < self.MPEG_MIN_BYTES:
                self.append_db_errata(path, "Rejected file size is: %f KB" % (size / 1024.0))
                return False
        return True

    def get_different_asset_no(self):
        """Return how many times a fallback asset type was found on disk."""
        return self.found_different_asset_type

    def reset_different_asset_no(self):
        """Reset the fallback asset type counter to zero."""
        self.found_different_asset_type = 0

    def append_db_errata(self, filename, reason, metadata=""):
        """Append a line to the file collecting assets present in the DB but unusable on disk.

        The line is ``filename,reason`` or ``filename,reason,metadata``.
        Returns True if the line was written, False if no errata file is
        configured or the write failed (the failure is logged, not raised).
        """
        if not self.errata_file:
            return False
        fields = [filename, reason]
        if metadata:
            fields.append(metadata)
        line = ",".join("%s" % field for field in fields) + "\n"
        try:
            with open(self.errata_file, "a") as ef:
                ef.write(line)
        except (IOError, OSError, UnicodeError):
            # best effort: reporting problems must never abort processing
            self.log.error("Failed to append database errata.")
            return False
        return True
229