Changeset 34976 for main


Ignore:
Timestamp:
2021-03-26T17:46:12+13:00 (3 years ago)
Author:
davidb
Message:

Added an output method that prints (for now) the metadata being built up for country+year

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-process-votes.py

    r34975 r34976  
    44
    55import os
     6import re
    67
    78import bs4
     
    2728
    2829    table_header = table_rows[0]
    29     # print("table header = " + repr(table_header))
    3030
    3131    header_to_vals = {}
     
    3838            header_label = "Language"
    3939           
    40         # print("header: '" + header_label+"'")
    41 
    4240        headers.append(header_label)
    4341        header_to_vals[header_label] = []
     
    5452    return header_to_vals
    5553
    56 def convert_cols_to_country_recs(header_to_vals):
     54def convert_cols_to_country_recs(header_to_vals,year):
    5755
    5856    country_recs = {}
    5957
    60     for country in header_to_vals.get("Country"):
     58    for country_tag in header_to_vals.get("Country"):
     59        # print("@@@@@ td = " + repr(country_tag));
     60       
     61        country = country_tag.find("a",href=re.compile(r"^.*" + str(year) + r"$")).string
     62        # print("**** country = " + country)
     63       
    6164        country_recs[country] = {}
    6265                   
     
    6871       
    6972        for l in range(0,len(vals)):
    70             country = header_to_vals.get("Country")[l]
     73            country_tag = header_to_vals.get("Country")[l]
     74            country_flag_img = country_tag.find("img")
     75            country = country_tag.find("a",href=re.compile(r"^.*" + str(year) + r"$")).string
     76           
    7177            val = vals[l]
    72 
     78            print("@@@ val = " + repr(val))
     79           
     80            if key == "Artist":
     81                a_val = val.find("a")
     82                if (a_val is not None):
     83                    val = a_val
     84            elif key == "Song":
     85                a_val = val.find("a")
     86                if (a_val is not None):
     87                    val = a_val
     88            elif key == "Language":
     89                a_val = val.find("a")
     90                if (a_val is not None):
     91                    val = a_val
     92            elif key == "Place":
     93                span_val = val.find("span")
     94                if (span_val is not None):
     95                    val = span_val
     96
     97            for inner_val in val.contents:
     98                if (inner_val.string and re.search("[^\s]",inner_val.string)):
     99                    val = inner_val
     100                    break
     101               
     102            val = val.string.strip()
     103
     104            #print("country = " + country);
     105            #print("key = " + key);
     106            #print("*** storing: " + country + "[" + key + "] = " + val)
     107           
    73108            country_recs[country][key] = val
    74109
    75110    return country_recs
    76111
     112def output_country_recs(country_recs):
     113
     114    for country_name in country_recs.keys():
     115        country_rec = country_recs.get(country_name)
     116
     117        print("[" + country_name + "]")
     118             
     119        for metadata_key in country_rec.keys():
     120            metadata_val = country_rec.get(metadata_key)
     121            print("  " + metadata_key + " = " + repr(metadata_val))
     122       
    77123def process_category_page(year):
    78124    esc_wiki_page = "Eurovision_Song_Contest_" + str(year)
     
    118164    header_to_vals = html_tablerows_to_hashmap(table_rows)
    119165
    120     convert_cols_to_country_recs(header_to_vals)
    121    
     166    country_recs = convert_cols_to_country_recs(header_to_vals,year)
     167
     168    print()
     169    print("==========")
     170
     171    output_country_recs(country_recs)
    122172   
    123173   
Note: See TracChangeset for help on using the changeset viewer.