Mercurial > hg > dml-open-backendtools
comparison pyspark/test_timeside_vamp_spark_charm.py @ 0:e34cf1b6fe09 tip
commit
author | Daniel Wolff |
---|---|
date | Sat, 20 Feb 2016 18:14:24 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e34cf1b6fe09 |
---|---|
1 # Part of DML (Digital Music Laboratory) | |
2 # | |
3 # This program is free software; you can redistribute it and/or | |
4 # modify it under the terms of the GNU General Public License | |
5 # as published by the Free Software Foundation; either version 2 | |
6 # of the License, or (at your option) any later version. | |
7 # | |
8 # This program is distributed in the hope that it will be useful, | |
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
11 # GNU General Public License for more details. | |
12 # | |
13 # You should have received a copy of the GNU General Public | |
14 # License along with this library; if not, write to the Free Software | |
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
16 | |
#!/usr/local/spark-1.0.0-bin-hadoop2/bin/spark-submit
# -*- coding: utf-8 -*-
# Module metadata
__author__="wolffd"
__date__ ="$11-Jul-2014 15:31:01$"

# How to run this?

# to start hdfs: /usr/local/hadoop/sbin/start-dfs.sh

# Running python applications through ./bin/pyspark is deprecated as of Spark 1.0.
# Use ./bin/spark-submit
# spark-submit test_timeside_vamp_spark_charm.py --py-files vamp_plugin_dml.py,timeside_vamp.py,decode_to_wav.py

#import pydoop.hdfs as hdfs
from pyspark import SparkConf, SparkContext
# @todo: timeside has to be packed for multi-pc usage
import os.path
import os
import sys
from os import walk
# NOTE: this is only for debugging purposes, we can
# now use a regular timeside installation, e.g. installed by pip.
# Prepends a sibling TimeSide checkout to the module search path.
sys.path.append(os.getcwd() + '/../TimeSide/')

# mappers — the star imports are expected to provide decode_to_wav()
# and transform() used in main() below.
# NOTE(review): wildcard imports hide the exact provided names — confirm
# against timeside_vamp.py / decode_to_wav.py.
from timeside_vamp import *
from decode_to_wav import *
44 | |
45 def main(): | |
46 print "PySpark Telemeta and Vamp Test on CHARM" | |
47 | |
48 # configure the Spark Setup | |
49 conf = (SparkConf() | |
50 .setMaster("spark://0.0.0.0:7077") | |
51 #.setMaster("local") | |
52 .setAppName("CharmVamp") | |
53 .set("spark.executor.memory", "1g")) | |
54 sc = SparkContext(conf = conf) | |
55 | |
56 # SMB Share | |
57 # mount.cifs //10.2.165.194/mirg /home/wolffd/wansteadshare -o username=dml,password=xxx,domain=ENTERPRISE") | |
58 | |
59 | |
60 # uses local paths | |
61 # get list of obkects to process | |
62 mypath = '/samples/' | |
63 data = [] | |
64 for (dirpath, dirnames, filenames) in walk(mypath): | |
65 for file in filenames: | |
66 if file.endswith(".wav") or file.endswith(".flac"): | |
67 data.append(os.path.join(dirpath, file)) | |
68 | |
69 data = data[0:2] | |
70 # HDFS | |
71 # note: for HDFS we need wrappers for VAMP and gstreamer :/ | |
72 # copy to hdfs (put in different file before) | |
73 #hdfs.mkdir("test") | |
74 #hdfs.chmod("test","o+rw") | |
75 ##this copies the test wavs to hdfs | |
76 #hdfs.put("samples/","test/") | |
77 # get hdfs paths | |
78 # data = [] | |
79 # filenames = hdfs.ls("hdfs://0.0.0.0:9000/user/hduser/test/samples") | |
80 # print filenames | |
81 # for file in filenames: | |
82 # if file[-4:]== ".wav" or file[-4:]==".flac": | |
83 # data.append(file) | |
84 # | |
85 # define distributed dataset | |
86 # todo: can we do this with the wav data itself? | |
87 distData = sc.parallelize(data) | |
88 | |
89 # define map that decodes to wav | |
90 m0 = distData.map(lambda x: decode_to_wav(source=x)) | |
91 | |
92 # define map that applies the vamp plugin | |
93 m1 = m0.map(lambda x: transform(wav_file=x)).collect() | |
94 print m1 | |
95 return m1 | |
96 #process 2 | |
97 #m1.take(2) | |
98 | |
# Standard script entry point: run the pipeline when invoked via
# spark-submit (or directly), but not when imported as a module.
if __name__ == "__main__":
    main()
101 |