Commit 6bbf9afe authored by Jan Rudolf

ADD preprocessing

parent 2f0eef1a
......@@ -3,3 +3,4 @@ data
.idea
genres.tar.gz
*/.ipynb_checkpoints
input
......@@ -11,6 +11,10 @@ venv-activate:
get-dataset:
. ./bin/get-dataset.sh
preprocess: get-dataset
. ./venv/bin/activate && \
python3 audio_classification/preprocess.py
export-libs:
./venv/bin/pip3 freeze > requirements.txt
......
import glob
import os
import numpy as np
import pandas as pd
import librosa
# Folders, relative to the working directory the script is started from.
DATASET_FOLDER = 'data'    # raw dataset: audio files plus the manifest
OUTPUT_FOLDER = 'input'    # destination for the extracted feature files

# Tab-separated manifest read by preprocess() (columns: file, label).
DATASET_FILE = os.path.join(DATASET_FOLDER, 'input.mf')
# CSV index written by preprocess() (columns: feature file, label; no header).
OUTPUT_FILE = os.path.join(OUTPUT_FOLDER, 'data.csv')

# Absolute path prefix that the manifest entries carry; preprocess() removes
# it to obtain paths relative to DATASET_FOLDER.
DUMB_PREFIX = '/Users/sness/mirex2008/genres/'
def preprocess():
    """Convert every audio file listed in the dataset manifest to MFCC features.

    Reads the tab-separated manifest ``DATASET_FILE`` (columns ``file`` and
    ``label``, no header row). For each entry it:

    1. strips ``DUMB_PREFIX`` from the recorded absolute path to get a path
       relative to ``DATASET_FOLDER``;
    2. loads the audio with librosa, trims it with ``librosa.effects.trim``
       and computes 40 MFCC coefficients;
    3. saves the MFCC array as a ``.npy`` file under ``OUTPUT_FOLDER``,
       mirroring the relative path of the source file.

    Finally writes ``OUTPUT_FILE``: a header-less CSV with one
    ``<feature file>,<label>`` row per processed track.

    Progress is printed to stdout on a single, continuously rewritten line.

    Raises:
        ValueError: if a manifest path does not start with ``DUMB_PREFIX``
            (slicing it anyway would silently yield a wrong path).
    """
    columns = ['file', 'label']
    df = pd.read_csv(DATASET_FILE, sep='\t', header=None, names=columns)
    length = len(df)
    output_index = []

    # exist_ok avoids the check-then-create race and makes reruns harmless.
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    for genre in df['label'].unique().tolist():
        os.makedirs(os.path.join(OUTPUT_FOLDER, genre), exist_ok=True)

    rows = zip(df['file'], df['label'])
    for done, (dumb_absolute_path, label) in enumerate(rows, start=1):
        if not dumb_absolute_path.startswith(DUMB_PREFIX):
            raise ValueError(
                f'Manifest path {dumb_absolute_path!r} does not start with '
                f'the expected prefix {DUMB_PREFIX!r}'
            )
        smart_suffix = dumb_absolute_path[len(DUMB_PREFIX):]
        smart_relative_path = os.path.join(DATASET_FOLDER, smart_suffix)

        # splitext copes with extensions of any length; a fixed [:-4] slice
        # would cut into the file name for e.g. '.au' and make distinct
        # tracks collide on the same output file.
        output_stem, _extension = os.path.splitext(smart_suffix)
        smart_relative_output_path = os.path.join(OUTPUT_FOLDER, f'{output_stem}.npy')
        # The source sub-folder is not guaranteed to equal the label, so make
        # sure the actual parent folder of the output file exists.
        os.makedirs(os.path.dirname(smart_relative_output_path), exist_ok=True)

        audio_file, sr = librosa.load(smart_relative_path)
        y, _trim_interval = librosa.effects.trim(audio_file)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
        np.save(smart_relative_output_path, mfccs)

        # Record the entry only once its feature file really exists on disk.
        output_index.append((smart_relative_output_path, label))
        print(f'Preprocessing <{done}/{length}> ', end='\r')

    print('\nDone')
    output_df = pd.DataFrame(output_index, columns=columns)
    output_df.to_csv(OUTPUT_FILE, header=False, index=False)
# Run the preprocessing only when this file is executed as a script
# (e.g. via `python3 audio_classification/preprocess.py`), not on import.
if __name__ == '__main__':
    preprocess()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment