Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: other-projects/mirex/grand-challenge/generate-jamendo-dataset/scripts-2015/generate-evened-out-genre-csv.py@ 30005

Last change on this file since 30005 was 30005, checked in by davidb, 9 years ago
Initial cut at scripts for generating a 'boosted' genre set of files
Property svn:executable set to `*`
File size: 2.7 KB

Rev	Line
[30005]	1	#!/usr/bin/python
	2
	3	from os import listdir
	4	from os.path import isfile, join
	5
	6	import json
	7	import random
	8	import sys
	9
	10
	11
	12	def tabulate_results():
	13	print "===="
	14	print "Number of records processed " + str(jid_count)
	15	print "===="
	16
	17	ofd_freq = open(join(output_dir,"genre-tags-freq-10000.csv"), "w")
	18
	19	print "Generating ..."
	20	print " Genre Tags Frequency Distribution"
	21	print " (one track can be given assigned multiple genres)"
	22
	23	print >> ofd_freq, "Genre tag,Frequency count"
	24
	25	for gk in genre_table.keys():
	26	print >> ofd_freq, "{0:20}".format(gk) + "," + str(genre_table[gk])
	27
	28	ofd_freq.close();
	29
	30	ofd_distrib = open(join(output_dir, "genre-tags-distribution-count-10000.csv"), "w")
	31
	32	print " Number of Genre Tags Assigned per Track"
	33	print >> ofd_distrib, "Num of tags,Frequency count"
	34	for gk in sorted(genre_table_tagcount.keys()):
	35	print >> ofd_distrib, gk + "," + str(genre_table_tagcount[gk])
	36
	37	ofd_distrib.close();
	38
	39
	40
	41	def processTagset(jid,tagrec):
	42
	43	# genres, instruments, vartags
	44
	45	genres = tagrec[u'genres']
	46	num_genres = len(genres)
	47
	48	num_genres_str = str(num_genres)
	49	if not genre_table_tagcount.has_key(num_genres_str):
	50	genre_table_tagcount[num_genres_str] = 0
	51	genre_table_tagcount[num_genres_str] += 1
	52
	53	for g in genres:
	54	if not genre_table.has_key(g):
	55	genre_table[g] = 0
	56	genre_table[g] += 1
	57
	58
	59
	60	argc = len(sys.argv)
	61
	62	input_dir = sys.argv[1] if (argc==2) or (argc==3) else "download-json-all"
	63	output_dir = sys.argv[2] if argc==3 else "csv-data"
	64
	65	json_files = [ jf for jf in listdir(input_dir) if isfile(join(input_dir,jf)) ]
	66
	67	jamendo_ids = [];
	68
	69	genre_table = {}
	70	genre_table_tagcount = {}
	71
	72	jid_count = 0
	73
	74	evened_out_json_data=open('jamendo-evened-out-dataset-trackids.json').read()
	75	evened_out_jamendo_ids=json.loads(evened_out_json_data)
	76
	77	evened_out_hashmap = {}
	78	for jid in evened_out_jamendo_ids:
	79	evened_out_hashmap[jid] = True
	80
	81	print "Evened out dataset size = " + str(len(evened_out_hashmap))
	82
	83	all_hashmap = {}
	84
	85	for jf in json_files:
	86	print "Processing" + jf
	87
	88	json_filename = join(input_dir,jf)
	89	ifd = open(json_filename)
	90	json_data = ifd.read()
	91	data = json.loads(json_data)
	92
	93	# if jid_count>100:
	94	# break
	95
	96	for rec in data[u'results']:
	97
	98	jid = rec[u'id']
	99	all_hashmap[jid] = True
	100
	101	jid_count += 1
	102
	103	musicinfo = rec[u'musicinfo']
	104	tags = musicinfo[u'tags']
	105	if (evened_out_hashmap.get(jid) != None):
	106	processTagset(jid,tags)
	107
	108
	109	ifd.close()
	110
	111
	112	tabulate_results()
	113
	114	print "Checking for consitency!"
	115
	116	for jid in evened_out_jamendo_ids:
	117	if all_hashmap.get(jid) == None:
	118	print "**** Not found in all jid map:" + str(jid)
	119
	120
	121
	122
	123
	124

Note: See TracBrowser for help on using the repository browser.

Download in other formats: