# TODO
# Check to see if Song/Title coming back with quotes around it
# In addition to Languages->Language, split of "," ??
# (e.g., 2005 has "Language" but entries like English, Spanish)

from __future__ import print_function

import argparse
import json
import os
import re

import bs4
import requests
import wikipedia

DEBUG = False

# Module-level cache directory; set via init_cache() before any retrieval call.
cache_dir_ = None


def init_cache(cache_dir):
    """Record the on-disk cache directory, creating it if it does not exist."""
    global cache_dir_
    cache_dir_ = cache_dir
    if not os.path.exists(cache_dir_):
        print("Making cache directory: " + cache_dir_)
        os.mkdir(cache_dir_)


def read_text_file(input_filename):
    """Return the full contents of a text file."""
    # 'with' guarantees the handle is closed even if read() raises
    with open(input_filename, "r") as f:
        return f.read()


def write_text_file(output_filename, text):
    """Write 'text' to a file, replacing any existing content."""
    with open(output_filename, "w") as f:
        f.write(text)


def html_tablerows_to_hashmap(table_rows):
    """Convert a results <table>'s rows into {header_label: [cell_tag, ...]}.

    The first row is assumed to be the header row (<th> cells).  Header
    labels are normalized across years ("Language(s)" -> "Language",
    "Song" -> "Title").  Cell values are kept as bs4 tags, not strings,
    so callers can drill into <a>/<span>/<img> children later.
    """
    header_to_vals = {}
    headers = []
    for header in table_rows[0].find_all("th"):
        header_label = header.contents[0].strip()
        # Normalize header variants so downstream keys are consistent
        if header_label == "Language(s)":
            header_label = "Language"
        if header_label == "Song":
            header_label = "Title"
        headers.append(header_label)
        header_to_vals[header_label] = []
    print(" Headers = " + ",".join(header_to_vals.keys()))
    for row in table_rows[1:]:
        for header_label, cell in zip(headers, row.find_all("td")):
            header_to_vals[header_label].append(cell)
    return header_to_vals


def convert_cols_to_country_year_recsOLD(header_to_vals, year):
    """Legacy version of convert_cols_to_country_year_recs (kept for reference).

    Builds {country+year: metadata_dict} from the column map.  Superseded by
    convert_cols_to_country_year_recs, which also accumulates across tables
    and handles 1956's double entries.
    """
    country_year_recs = {}
    a_href_re = re.compile(r"^.*" + str(year) + r"$")
    for country_tag in header_to_vals.get("Country"):
        country = country_tag.find("a", href=a_href_re).string
        country_year = country + str(year)
        country_year_recs[country_year] = {"Country": country, "Year": year}
    for key in header_to_vals.keys():
        if key == "Country":
            continue
        vals = header_to_vals.get(key)
        for i in range(0, len(vals)):
            country_tag = header_to_vals.get("Country")[i]
            country_flag_img = country_tag.find("img")
            country = country_tag.find("a", href=a_href_re).string
            country_year = country + str(year)
            val = vals[i]
            # These columns usually wrap their text in a link; prefer it
            if key in ("Artist", "Title", "Language"):
                a_val = val.find("a")
                if a_val is not None:
                    val = a_val
            elif key == "Place":
                span_val = val.find("span")
                if span_val is not None:
                    val = span_val
            # Take the first child that contains non-whitespace text
            for inner_val in val.contents:
                if inner_val.string and re.search(r"[^\s]", inner_val.string):
                    val = inner_val
                    break
            val = val.string.strip()
            country_year_recs[country_year][key] = val
    return country_year_recs


def convert_cols_to_country_year_recs(header_to_vals, year,
                                      accumulated_country_year_recs,
                                      extra_metadata):
    """Fold one results table's columns into accumulated_country_year_recs.

    Each record is keyed by country+year and carries Country, Year, FlagImg,
    plus one entry per table column (with "Points" renamed "VoteGrandTotal").
    extra_metadata, if given, seeds every record.  Returns (and mutates)
    accumulated_country_year_recs.
    """
    a_href_re = re.compile(r"^.*" + str(year) + r"$")
    country_list = []
    for country_tag in header_to_vals.get("Country"):
        country = country_tag.find("a", href=a_href_re).string
        country_list.append(country)
    # Only need to worry about a country double up occurring in this
    # country_list.  OK for there to be a double up between 'accumulated'
    # and country_list, as this is just a sign of a country that was in a
    # semi-final progressing to the final
    country_year_recs = {}
    for i in range(0, len(country_list)):
        country = country_list[i]
        country_year = country + str(year)
        this_country_year_rec = {}
        if extra_metadata is not None:
            # copy() so each record gets its own dict, not a shared one
            this_country_year_rec = extra_metadata.copy()
        this_country_year_rec["Country"] = country
        this_country_year_rec["Year"] = year
        for key in header_to_vals.keys():
            val = header_to_vals.get(key)[i]
            if key == "Country":
                country_flag_img = val.find("img")
                this_country_year_rec["FlagImg"] = str(country_flag_img)
                continue
            # These columns usually wrap their text in a link; prefer it
            if key in ("Artist", "Title", "Language"):
                a_val = val.find("a")
                if a_val is not None:
                    val = a_val
            elif key == "Place":
                span_val = val.find("span")
                if span_val is not None:
                    val = span_val
            elif key == "Points":
                key = "VoteGrandTotal"
            # Take the first child that contains non-whitespace text
            for inner_val in val.contents:
                if inner_val.string and re.search(r"[^\s]", inner_val.string):
                    val = inner_val
                    break
            val = val.string.strip()
            this_country_year_rec[key] = val
        if country_year in country_year_recs:
            # 1956, where countries had 2 entries!  Disambiguate both the
            # existing record and this one by suffixing the key with the
            # first 3 non-space chars of each entry's Title.
            country_year_rec = country_year_recs[country_year]
            del country_year_recs[country_year]
            country_year_title = country_year_rec.get("Title")
            country_year_suffix = re.sub(r"\s+", "", country_year_title)
            new_country_year_key = country_year + country_year_suffix[0:3]
            country_year_recs[new_country_year_key] = country_year_rec
            this_country_year_title = this_country_year_rec.get("Title")
            this_country_year_suffix = re.sub(r"\s+", "", this_country_year_title)
            country_year = country_year + this_country_year_suffix[0:3]
        country_year_recs[country_year] = this_country_year_rec
    # Top up the accumulated version with what has been added into country_year_recs
    for country_year in country_year_recs.keys():
        accumulated_country_year_recs[country_year] = country_year_recs[country_year]
    return accumulated_country_year_recs


def debug_output_country_year_recs(country_year_recs):
    """Pretty-print every record's metadata for debugging."""
    for country_name in country_year_recs.keys():
        country_rec = country_year_recs.get(country_name)
        print("[" + country_name + "]")
        for metadata_key in country_rec.keys():
            metadata_val = country_rec.get(metadata_key)
            print(" " + metadata_key + " = " + repr(metadata_val))


def retrieve_article_page(year):
    """Return the HTML of the ESC article page for 'year', using the cache.

    On a cache miss the page is fetched via the wikipedia package and the
    HTML is saved into cache_dir_ for next time.
    """
    esc_wiki_page = "Eurovision_Song_Contest_" + str(year)
    esc_wiki_page_file = esc_wiki_page.replace(" ", "_") + ".html"
    esc_wiki_page_filename = os.path.join(cache_dir_, esc_wiki_page_file)
    esc_article_year_html = ""
    if not os.path.exists(esc_wiki_page_filename):
        print("Retrieving Wikipedia page '" + esc_wiki_page + "'")
        esc_article_year_wp = wikipedia.page(esc_wiki_page, auto_suggest=False,
                                             preload=True, redirect=True)
        esc_article_year_html = esc_article_year_wp.html()
        print(" Saving page to cache")
        write_text_file(esc_wiki_page_filename, esc_article_year_html)
    else:
        print("Retrieving " + esc_wiki_page_file + " from cache")
        esc_article_year_html = read_text_file(esc_wiki_page_filename)
    return esc_article_year_html


def process_article_page(esc_article_year_html, year, result_ids,
                         stop_at_first, extra_metadata=None):
    """Extract country/year records from an ESC article page.

    result_ids lists the heading span ids to look for (e.g. 'Grand_final',
    'Final', 'Results'); the table following each matched heading is parsed.
    If stop_at_first is true, only the first matching section is used.
    Also splices the year's logo <img> into every record as "YearLogoImg".
    """
    country_year_recs = {}
    esc_year_soup = bs4.BeautifulSoup(esc_article_year_html, 'html.parser')
    results_heading_list = []
    for fr_id in result_ids:
        # Manually deal with an exception, where "Final" turns up in a
        # side-bar infobox before the actual Results section in the main page
        if (year == 1996) and (fr_id == "Final"):
            continue
        results_text_span = esc_year_soup.find("span", id=fr_id)
        if results_text_span is not None:
            print(" Found Final Results heading with id: " + fr_id)
            results_heading_list.append(results_text_span.parent)
            if stop_at_first:
                break
    for result_heading in results_heading_list:
        # BUGFIX: was 'results_heading.findNext(...)' — a stale variable from
        # the search loop above — so every iteration re-processed the same
        # table.  Use the loop variable so each heading gets its own table.
        results_table = result_heading.findNext('table')
        table_rows = results_table.find_all('tr')
        print(" Number of rows in Results table = " + str(len(table_rows)))
        header_to_vals = html_tablerows_to_hashmap(table_rows)
        convert_cols_to_country_year_recs(header_to_vals, year,
                                          country_year_recs, extra_metadata)
        print("==========")
    ## debug_output_country_year_recs(country_year_recs)

    # Splice in logo image for that year into each country_year_rec.
    # NOTE: Wikipedia has changed some of its infobox CSS over time.  For
    # newer downloads the selector is probably:
    #   esc_year_soup.select("table.infobox td.infobox-image img")
    # The selector below matches the older markup these cached pages use.
    infobox_imgs = esc_year_soup.select("table.infobox tr td a.image img")
    if len(infobox_imgs) == 0:
        print("****")
        print("****!!! No ESC Logo image found!!!")
        print("****")
    else:
        # Some pages include additional image graphics, such as a map showing
        # country entrants => Want the first one
        infobox_logo_img = infobox_imgs[0]
        for country_year_key in country_year_recs.keys():
            country_year_rec = country_year_recs.get(country_year_key)
            country_year_rec["YearLogoImg"] = str(infobox_logo_img)
    return country_year_recs


def process_category_page(year):
    """Return {country_name: 1} for every country in the year's ESC category.

    Scrapes the "Category:Countries in the Eurovision Song Contest <year>"
    page (fetched with requests, since wikipedia.page() does not handle
    Category: titles well) and matches links of the form
    /wiki/<Country>_in_the_Eurovision_Song_Contest_<year>.
    """
    category_countries = {}
    esc_wiki_page = "Category:Countries_in_the_Eurovision_Song_Contest_" + str(year)
    esc_wiki_page_file = esc_wiki_page.replace(" ", "_") + ".html"
    esc_wiki_page_filename = os.path.join(cache_dir_, esc_wiki_page_file)
    esc_cat_year_html = ""
    if not os.path.exists(esc_wiki_page_filename):
        print("Retrieving Wikipedia page '" + esc_wiki_page + "'")
        response = requests.get("https://en.wikipedia.org/wiki/" + esc_wiki_page)
        esc_cat_year_html = response.text
        print(" Saving page to cache")
        write_text_file(esc_wiki_page_filename, esc_cat_year_html)
    else:
        print("Retrieving " + esc_wiki_page_file + " from cache")
        esc_cat_year_html = read_text_file(esc_wiki_page_filename)
    esc_cat_year_soup = bs4.BeautifulSoup(esc_cat_year_html, 'html.parser')
    a_href_re = re.compile(r"^/wiki/([^/:]+)_in_the_Eurovision_Song_Contest_"
                           + str(year) + r"$")
    for a_tag in esc_cat_year_soup.find_all("a", href=a_href_re):
        href = a_tag.get("href")
        country = re.search(a_href_re, href).group(1)
        country = country.replace("_", " ")
        category_countries[country] = 1
    return category_countries