Context Navigation

source: main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/08b-COPY-CSV-AND-AUDIO-FEATURES-TO-IMPORT_support.py@ 35990

Last change on this file since 35990 was 35990, checked in by davidb, 2 years ago
Changes after testing
Property svn:executable set to `*`
File size: 8.4 KB

Line
1	#!/usr/bin/env python3
2
3	import argparse
4
5	import os
6	import re
7	import shutil
8
9	from glob import glob
10
# Relative locations used throughout the script: the prepared Essentia output,
# its 'audio' sub-folder (one directory per contest year), and the collection's
# 'import' folder one level up.
prep_dir = "essentia-audio-features"
audio_dir = os.path.join(prep_dir, "audio")
import_dir = os.path.join("..", "import")


# Command line: both year bounds are optional; leaving one out keeps that side
# of the year range open.
parser = argparse.ArgumentParser(
    description="Copy Essentia generated audio features into Greenstone3 collection's '../import' directory"
)
parser.add_argument(
    '--startyear',
    type=int,
    default=None,
    help='Start year range of the Eurovision Song Contest',
)
parser.add_argument(
    '--endyear',
    type=int,
    default=None,
    help='End year range of the Eurovision Song Contest',
)
args = parser.parse_args()

# argparse has already applied type=int, so each value is an int or None.
start_year = args.startyear
end_year = args.endyear
26
27
# Each sub-directory of audio_dir is treated as one contest year.
years = [
    entry
    for entry in os.listdir(audio_dir)
    if os.path.isdir(os.path.join(audio_dir, entry))
]

sorted_years = sorted(years)

# Keep only the years inside the requested range.  A directory name is converted
# with int() only when the corresponding bound was actually supplied.
opt_filtered_sorted_years = [
    y
    for y in sorted_years
    if (start_year is None or int(y) >= start_year)
    and (end_year is None or int(y) <= end_year)
]
43
44
print("Reading in Audio-Feature JSON files by year")

# Entries are stored as '<year>/<filename>.json' (joined with os.path.join),
# i.e. relative to audio_dir.
all_af_json_files = []

for year in opt_filtered_sorted_years:

    if year == "2020":
        for line in ("====",
                     "Skipping year 2020 as no contest was held that year",
                     "===="):
            print(line)
        continue

    audio_year_dir = os.path.join(audio_dir, year)

    all_af_json_files.extend(
        os.path.join(year, name)
        for name in os.listdir(audio_year_dir)
        if re.search(r"\.json$", name)
    )
65
66
67	#print("json files = " + repr(all_af_json_files))
68
69
# One dict per audio-feature file, with the keys "year", "country", "title"
# and "artist" taken from the '<year><sep><country>_<title>_<artist>' name.
entrant_recs = []

# The entries were built with os.path.join(), so the separator after the year
# is os.sep rather than always '/'; a hard-coded '/' would reject every entry
# on Windows.  The artist group runs to the end of the string and therefore
# keeps the '.json' extension, which is what allows the three parts to be
# re-joined into the source filename when the files are copied later on.
entrant_pattern = re.compile(
    r"^(?P<year>\d{4})"
    + re.escape(os.sep)
    + r"(?P<country>[^_]+)_(?P<title>[^_]+)_(?P<artist>[^_]+)$"
)

for year_and_entrant in all_af_json_files:

    match = entrant_pattern.search(year_and_entrant)

    if match:
        # groupdict() yields exactly the four named fields as strings.
        entrant_recs.append(match.groupdict())
    else:
        # Names with more or fewer than two '_' separators end up here.
        print("**** No match found for: '" + year_and_entrant +"'")
93
94
#
# af = audio-features
#
# all_af_nul_files : the '.nul' name derived for every audio-feature file, in order
# src_af_lookup    : '.nul' name => path of the audio-feature JSON it came from
#
all_af_nul_files = []
src_af_lookup = {}

for rec in entrant_recs:
    # Path of the source JSON, rebuilt from the untouched filename parts.
    src_path = os.path.join(
        audio_dir,
        rec["year"],
        "_".join((rec["country"], rec["title"], rec["artist"])),
    )

    # Tweak the values so they align with those used in SPARQL resultsset
    country_label = rec["country"].replace("&", "and").replace(" ", "")

    year_label = rec["year"]
    if year_label == "1956":
        # For 1956 the name also carries the first three characters of the
        # space-stripped title, upper-cased (presumably to tell apart several
        # entries from one country that year -- confirm).
        year_label += rec["title"].lower().replace(" ", "")[0:3].upper()

    nul_name = country_label + year_label + ".nul"

    all_af_nul_files.append(nul_name)
    src_af_lookup[nul_name] = src_path
128
129
# Directories inside 'import' that hold the per-entrant .nul files.
entrant_nul_dirs = []

# In Python's 're' syntax alternation is a bare '|'; an escaped '\|' matches a
# literal pipe character, which made the previous pattern unable to match any
# of the three directory names.  The alternatives are wrapped in one group so
# that '^' and '$' anchor all of them, not just the first and the last.
nul_dir_pattern = re.compile(
    r"^(?:sparqlresults-local--countries-in-esc-by-year-.*"
    r"|missing-cat-countries"
    r"|inaugural-year)$"
)

for d in os.listdir(import_dir):
    full_d = os.path.join(import_dir, d)
    if os.path.isdir(full_d) and nul_dir_pattern.search(d):
        entrant_nul_dirs.append(full_d)


# '.nul' filename => list of every full path under 'import' with that name
# (the same filename can occur in more than one of the directories above).
entrant_nul_files_lookup = {}

for d in entrant_nul_dirs:
    for f in os.listdir(d):
        if re.search(r"\.nul$", f):
            entrant_nul_files_lookup.setdefault(f, []).append(os.path.join(d, f))
155
156	#print()
157	#print("Lookup dict:")
158	#print(repr(entrant_nul_files_lookup))
159
160
161
# The country of Macedonia renamed itself to North Macedonia in response
# to a long running naming debate with the region of Macedonia in Greece
#
# Audio-Feature files sometimes refer to North Macedonia in places
# where the SPARQL results gives the country as Macedonia
#
# => Pass through the audio-feature data changing 'NorthMacedoniaYYYY.nul' to
#    'MacedoniaYYYY.nul' for any place where there is no entrant_nul_files_lookup
#    for 'NorthMacedonia'

print()
print("Running errata to better align Audio Feature derived .nul files with Sparql Resultset")

# Fixed renames, applied only when the derived name has no counterpart in
# 'import'.  The 1956 title suffix is generated fully upper-cased, so the
# 'Imw' spelling can no longer be produced; the 'IMW' spelling is listed as
# well so that this correction is reachable again.
# NOTE(review): confirm 'Germany1956ImW.nul' is still the name used on the
# SPARQL side.
known_errata = {
    "Germany1956Imw.nul": "Germany1956ImW.nul",
    "Germany1956IMW.nul": "Germany1956ImW.nul",
    "Yugoslavia1992.nul": "FRYugoslavia1992.nul",
}

for i, af_nul_file in enumerate(all_af_nul_files):

    # Names that already line up with a .nul file in 'import' need no fixing.
    if af_nul_file in entrant_nul_files_lookup:
        continue

    af_nul_file_errata = known_errata.get(af_nul_file)

    if af_nul_file_errata is None and re.search(r"^NorthMacedonia", af_nul_file):
        # Only rename when the 'Macedonia' spelling really exists in 'import'.
        af_nul_file_check = af_nul_file.replace("North", "")
        if af_nul_file_check in entrant_nul_files_lookup:
            af_nul_file_errata = af_nul_file_check

    if af_nul_file_errata is not None:
        print(" Errata: Fixing " + af_nul_file + " => " + af_nul_file_errata)

        # Re-key the source lookup and patch the list entry in place, so that
        # both structures agree on the corrected name.
        src_af_lookup[af_nul_file_errata] = src_af_lookup.pop(af_nul_file)
        all_af_nul_files[i] = af_nul_file_errata
198
199
print("Matching Audio-Feature files to nul files in 'import':")


print()
print("The following files have Audio Features, but did not match in to a .nul file in the 'import' diretory")

# Names with a counterpart in 'import' are kept for copying ...
matched_af_nul_files = [
    name for name in all_af_nul_files if name in entrant_nul_files_lookup
]

# ... and the remaining ones are listed for the operator.
for name in all_af_nul_files:
    if name not in entrant_nul_files_lookup:
        print(" " + name)


print()
print("Copying the following Essentia audio feature JSON files into .nul matched positions in 'import'")

for af_nul_file in matched_af_nul_files:
    src_json_file = src_af_lookup[af_nul_file]

    # Every matching .nul location receives two copies of the same JSON file,
    # named after the .nul file with one of the two suffixes below.
    for import_file in entrant_nul_files_lookup[af_nul_file]:
        for suffix in ("-Essentia.json", "-AssocEssentia.json"):
            dst_path = import_file.replace(".nul", suffix)

            print(" Copying: " + src_json_file + " => " + dst_path)
            shutil.copy(src_json_file, dst_path)
231
232
# Sort out contestant CSV files to copy into ../import area

# Filenames (relative to prep_dir) selected according to the year range given
# on the command line.
# NOTE(review): the names end in ".cvs", not ".csv" -- confirm this matches the
# files actually present in prep_dir.
src_csv_files = []

if end_year is None:
    # An open-ended range stops at the cutoff year taken from the environment.
    # The variable is read in this branch so that it is defined for both
    # open-ended cases (previously only the "no bounds" case set it, and
    # passing just --startyear hit a NameError).  Raises KeyError when
    # 'esc_cutoff_endyear' is not set.
    esc_cutoff_endyear = os.environ["esc_cutoff_endyear"]

    if start_year is None:
        # No bounds at all: 1956 has a file of its own, followed by
        # 1957 up to the cutoff year.
        src_csv_files.append("contestants-1956.cvs")
        src_csv_files.append(f"contestants-1957-to-{esc_cutoff_endyear}.cvs")
    else:
        src_csv_files.append(f"contestants-{start_year}-to-{esc_cutoff_endyear}.cvs")

elif start_year is None:
    src_csv_files.append(f"contestants-1956-to-{end_year}.cvs")

else:
    src_csv_files.append(f"contestants-{start_year}-to-{end_year}.cvs")
252
253
# Destination directories: the two fixed folders plus every per-year SPARQL
# results folder currently present under 'import'.
dst_csv_dirs = [
    os.path.join(import_dir, "inaugural-year"),
    os.path.join(import_dir, "missing-cat-countries"),
]

sparql_result_dirs = glob(os.path.join(import_dir, "sparqlresults-local--countries-in-esc-by-year-*/"))
dst_csv_dirs += sparql_result_dirs


print()
print("Copying the following Contestants CSV files into 'import'")

# Every selected CSV file is copied, under its own name, into every destination.
for dst_dir in dst_csv_dirs:
    for src_csv_file in src_csv_files:

        src_csv_filename = os.path.join(prep_dir, src_csv_file)
        dst_csv_filename = os.path.join(dst_dir, src_csv_file)

        print(" Copying: " + src_csv_filename + " => " + dst_csv_filename)
        # The destination is dst_csv_filename; the name used here before
        # ('dst_filename') was never defined and raised NameError.
        shutil.copy(src_csv_filename, dst_csv_filename)
271
272

Note: See TracBrowser for help on using the repository browser.

Download in other formats: