Mercurial > hg > plosone_underreview
view util_filter_dataset.py @ 0:ec9f7bfd63f6
initial commit
author | Maria Panteli <m.x.panteli@gmail.com> |
---|---|
date | Fri, 01 Sep 2017 18:45:31 +0300 |
parents | |
children |
line wrap: on
line source
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 10 17:44:48 2017

@author: mariapanteli

Helpers that drop unusable rows from a metadata DataFrame.

The functions below read the columns "Speech", "Genre_Album", "Melspec",
"Chroma", "Melodia" and "Country". Each ``get_*`` function returns a list of
*positional* row indices (0-based, suitable for ``DataFrame.iloc``) that should
be excluded; ``remove_missing_data`` combines them and returns the remaining
rows.
"""

import os

import numpy as np
import pandas as pd


# Substrings that disqualify a recording when found anywhere in its
# "Genre_Album" value (substring match, not exact match).
_INVALID_GENRES = (
    "Spoken Word", "Language Instruction", "Classical", "Poetry",
    "Nature|Sounds", "Music Instruction", "Soundtracks &", "Contemporary &",
    "Jazz &", "Sounds", "Ragtime", "Nature", "Electronic",
    "African American Spoken", "Blues", "Gospel", "Psychology &",
)

# "Country" values treated as missing (exact match after conversion to str,
# so a NaN entry matches 'nan'). The trailing spaces in the last two entries
# are intentional: they are kept exactly as in the original lists.
_INVALID_COUNTRIES = frozenset([
    'Unidentified', 'unknown', 'nan', 'Yugoslavia (former)',
    'Pathian village Wangulei ',
    'Joulouloum either Senegal or The Gambia ',
])


def get_speech_vamp(df):
    """Return positional indices of rows whose "Speech" file flags speech.

    Each "Speech" entry is a path to a tab-separated file without header
    whose third column holds segment labels. A row is selected when its file
    exists, is non-empty, has at least one row, and either contains no 'm'
    label at all or consists only of 's' labels.
    NOTE(review): 'm'/'s' presumably stand for music/speech segments --
    confirm against the tool that writes these files.

    Rows whose file is missing or empty are never selected.
    """
    speech_col = df.columns.get_loc("Speech")
    speechinds = []
    for i in range(len(df)):
        path = df.iat[i, speech_col]
        if not (os.path.exists(path) and os.path.getsize(path) > 0):
            continue
        # `.values` replaces `get_values()`, which was removed in pandas 1.0.
        bounds = pd.read_csv(path, header=None, sep='\t').values
        if len(bounds) == 0:
            continue
        labels = bounds[:, 2]
        if not np.any(labels == 'm') or np.all(labels == 's'):
            speechinds.append(i)
    return speechinds


def get_speech_meta(df):
    """Return positional indices of rows with an excluded "Genre_Album".

    The genre value is converted to ``str`` and the row is selected when any
    entry of ``_INVALID_GENRES`` occurs as a substring of it.
    """
    genres = np.array(df["Genre_Album"].values, dtype=str)
    return [i for i, genre in enumerate(genres)
            if any(tag in genre for tag in _INVALID_GENRES)]


def get_missing_csv(df):
    """Return positional indices of rows lacking any feature file.

    A row is selected unless the paths in its "Melspec", "Chroma" and
    "Melodia" columns all exist on disk.
    """
    feature_paths = zip(df["Melspec"], df["Chroma"], df["Melodia"])
    return [i for i, paths in enumerate(feature_paths)
            if not all(os.path.exists(path) for path in paths)]


def get_missing_country_meta(df):
    """Return positional indices of rows with a missing/invalid "Country".

    The country value is converted to ``str`` and compared for exact
    equality against ``_INVALID_COUNTRIES``.
    """
    country_labels = np.array(df['Country'].values, dtype=str)
    return [i for i, country in enumerate(country_labels)
            if country in _INVALID_COUNTRIES]


def remove_missing_data(df):
    """Return the rows of ``df`` that pass every filter in this module.

    Rows are dropped when they are flagged by ``get_speech_vamp``,
    ``get_speech_meta``, ``get_missing_csv`` or ``get_missing_country_meta``.
    The surviving rows keep their original relative order; ``df`` itself is
    not modified.
    """
    speechinds = set(get_speech_vamp(df)) | set(get_speech_meta(df))
    missing = set(get_missing_csv(df))
    missing_country = set(get_missing_country_meta(df))
    excluded = missing | speechinds | missing_country
    # Sort so the output order does not depend on set iteration order, and
    # force an integer dtype so an empty selection is still a valid indexer
    # (np.asarray([]) would be float64, which .iloc rejects).
    selectinds = np.asarray(sorted(set(range(len(df))) - excluded), dtype=int)
    return df.iloc[selectinds, :]