'''
Thomas Matlak, Avi Vajpeyi, Avery Rapson
CS 310 Final Project

Given text files with musical notes in int format, this creates a pickle of
the attributes and classes for all the musical data stored in the text files
(each text file is for one class). The data is stored as frequencies of each
note on a keyboard, and the class label is stored in 'one hot' format.
10 percent of the data present is set aside as testing data.

Usage:
    python createMusicalFeaturesets.py

OUTPUT:
    notesData.pickle
        A pickle with the attributes and classes for the music data.
        The pickle contains: train_attribute, train_class, test_attribute, test_class

NOTE: Update the following depending on usage of the script:
    ROOT_DIR = root/directory/where/text/files/reside
    DataFile = ["emotion1.txt", "emotion2.txt", ...]
'''

from mido import MidiFile, MidiTrack, Message
import mido
import random
import pickle
from collections import Counter
import numpy as np
import os

'''
Assume we have the following as our 'LEXICON' (unique word list):
    [chair, table, spoon, television]
Assume this is our current sample data:
    String: "I pulled my chair up to the table"
Create a training vector that holds the count of each lexicon word:
    training vector: [1, 1, 0, 0]
    (since chair and table are in the string, but spoon and television aren't)
Do this for all strings.
'''

ROOT_DIR = "TrainingData/"
DataFile = ["NegExamples/sadSongs.txt", "PosExamples/happySongs.txt"]
pianoSize = 128  # MIDI notes 0-127; this also defines the size of our 'lexicon'

# NOTE: the larger the dataset, the more memory gets used up -- a very large
# dataset can raise a MemoryError.


def sample_handling(sample, classification):
    '''
    Builds a featureset of the form:
        featureset = [
            [[0 1 0 0 1 0 0 ...], [1, 0]],
            [[0 1 0 0 1 1 1 ...], [0, 1]],
            ...
        ]
    The first list is the array of note counts (matches with the lexicon);
    the second is the one-hot classification the features fall into.
    '''
    featureset = []

    with open(sample, 'r') as f:
        contents = f.readlines()
        for line in contents:
            # Parse the space-separated note numbers on this line.
            notes = np.array(line.split(), dtype=int)
            # Count how many times each of the 128 keyboard notes occurs.
            noteCount = np.zeros(pianoSize)
            for note in notes:
                noteCount[note] += 1
            noteCount = list(noteCount)
            featureset.append([noteCount, classification])

    return featureset


def create_feature_sets_and_labels(DataFile, test_size=0.1):
    features = []
    features += sample_handling(ROOT_DIR + DataFile[0], [0, 1])  # neg (sad)
    features += sample_handling(ROOT_DIR + DataFile[1], [1, 0])  # pos (happy)
    random.shuffle(features)
    '''
    The one-hot labels let downstream code check
        tf.argmax([output]) == tf.argmax([expectations])
    which will look like:
        tf.argmax([55454, 342324]) == tf.argmax([1, 0])
    '''
    # dtype=object because each row pairs a 128-element feature list with a
    # 2-element label list.
    features = np.array(features, dtype=object)

    testing_size = int(test_size * len(features))

    # [[5,8],[7,9]] --> [:,0] gives [5,7] (all of the 0th elements),
    # i.e. the note-count feature vectors in this case.
    train_x = list(features[:, 0][:-testing_size])
    train_y = list(features[:, 1][:-testing_size])
    test_x = list(features[:, 0][-testing_size:])
    test_y = list(features[:, 1][-testing_size:])

    return train_x, train_y, test_x, test_y


if __name__ == '__main__':
    train_x, train_y, test_x, test_y = create_feature_sets_and_labels(DataFile)
    with open('notesData.pickle', 'wb') as f:
        # Dump the data as a single list into the file. Every input melody has
        # been converted to a note-frequency vector over the 128-note lexicon,
        # so saving this info can take a lot of memory!
        pickle.dump([train_x, train_y, test_x, test_y], f)
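
# -----------------------------------------------------------------------------
# Example downstream usage (a minimal sketch, assuming this script has already
# been run so that notesData.pickle exists; `load_feature_sets` is an
# illustrative helper name, not part of the original project):
#
#     import pickle
#
#     def load_feature_sets(pickle_path='notesData.pickle'):
#         """Reload the train/test split produced by this script."""
#         with open(pickle_path, 'rb') as f:
#             train_x, train_y, test_x, test_y = pickle.load(f)
#         return train_x, train_y, test_x, test_y
#
#     train_x, train_y, test_x, test_y = load_feature_sets()
#     # Each entry of train_x is a 128-element note-frequency vector and each
#     # entry of train_y is a one-hot label ([0, 1] = sad, [1, 0] = happy).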