Changeset 35185 for main


Ignore:
Timestamp:
2021-05-24T13:53:40+12:00 (3 years ago)
Author:
davidb
Message:

Code updated to work with 2021 data

Location:
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/03-GEN-VOTING-METADATA.sh

    r35070 r35185  
    2121
    2222
    23 if [ ! -f "$prep_dir/eurovision_song_contest_1975_2019.xlsx" ] ; then
     23if [ ! -f "$prep_dir/eurovision_song_contest_1975_2021.xlsx" ] ; then
    2424    echo "Unzipping $prep_dir/archive.zip:"
    2525   
     
    5050$prep_dir/xlsx-fromcountry-jsonmetadata.py \
    5151    --votingtype "J" \
    52     $prep_dir/eurovision_song_contest_1975_2019.xlsx  \
     52    $prep_dir/eurovision_song_contest_1975_2021.xlsx  \
    5353    $prep_dir/metadata-votes/metadata-votes-fromcountry-jury.json
    5454
     
    5757    $prep_dir/xlsx-fromcountry-jsonmetadata.py \
    5858    --votingtype "T" \
    59     $prep_dir/eurovision_song_contest_1975_2019.xlsx  \
     59    $prep_dir/eurovision_song_contest_1975_2021.xlsx  \
    6060    $prep_dir/metadata-votes/metadata-votes-fromcountry-tele.json
    6161
     
    6464    $prep_dir/xlsx-fromcountry-jsonmetadata.py \
    6565        --votingtype "JT" \
    66         $prep_dir/eurovision_song_contest_1975_2019.xlsx  \
     66        $prep_dir/eurovision_song_contest_1975_2021.xlsx  \
    6767        $prep_dir/metadata-votes/metadata-votes-fromcountry-comb.json
    6868
    6969    if [ $? = 0 ] ; then   
    70         $prep_dir/xlsx-tocountry-jsonmetadata.py $prep_dir/eurovision_song_contest_1975_2019.xlsx  $prep_dir/metadata-votes-tocountry.json
     70        $prep_dir/xlsx-tocountry-jsonmetadata.py $prep_dir/eurovision_song_contest_1975_2021.xlsx  $prep_dir/metadata-votes-tocountry.json
    7171
    7272        if [ $? != 0 ] ; then
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/05-PARSE-ADDITIONAL-METADATA-FROM-WIKIPEDIA.sh

    r35013 r35185  
    2626$prep_dir/esc-wikipedia-download-and-process-votes.py \
    2727    --startyear 1956 \
    28     --endyear 2019 \
     28    --endyear 2021 \
    2929    --cachedir $prep_dir/cache-wikipedia \
    3030    $prep_dir/metadata-esc-year/metadata_esc.json
     
    4141    $prep_dir//esc-wikipedia-download-and-detect-missing-cat-entries.py \
    4242    --startyear 1956 \
    43     --endyear 2019 \
     43    --endyear 2021 \
    4444    --cachedir $prep_dir/cache-wikipedia \
    4545    --queryfile.sparql "$prep_dir/dbpedia--countries-missing-from-esc-category-in-the-year.sparql" \
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/06-COPY-PARSED-ADDITIONAL-METADATA-TO-IMPORT.sh

    r35013 r35185  
    3939/bin/cp "errata-categories/missing-cat-countries/"* "$miscat_dir/."
    4040
     41
     42forced2021_dir=../import/sparqlresults-local--countries-in-esc-by-year-after-1956--with-errata.00000001
     43
     44echo ""
     45echo "Copying (forcing 2021 files into 00000001 import area):"
     46echo "  errata-categories/metadata-esc-year/*2021.nul -> $forced2021_dir/."
     47/bin/cp "errata-categories/metadata-esc-year/"*2021.nul "$forced2021_dir/."
     48
    4149echo ""
    4250
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-detect-missing-cat-entries.py

    r35032 r35185  
    146146
    147147
    148     final_result_ids = [ 'Grand_final', 'Final', 'Results' ]
     148    #final_result_ids = [ 'Grand_final', 'Final', 'Results' ]
     149    final_result_ids = ['Final', 'Results' ]
    149150    semifinal_result_ids = [ 'Semi-final']
    150151    semifinal_pair_result_ids = [ 'Semi-final_1', 'Semi-final_2' ]
     
    153154   
    154155    for year in range(start_year, end_year+1):
     156        if (year == 2020):
     157            print("******")
     158            print("* Skipping year 2020, as no contest held that year")
     159            print("******")
     160            continue
     161       
    155162        print("==========")
    156163
     
    158165        article_final_country_year_recs = escwikipedia.process_article_page(article_country_year_html,year,final_result_ids, stop_at_first=True)
    159166        article_semifinal_country_year_recs = escwikipedia.process_article_page(article_country_year_html,year,semifinal_result_ids, stop_at_first=True)
     167       
    160168        if (len(article_semifinal_country_year_recs.keys()) == 0):
    161169            article_semifinal_country_year_recs = escwikipedia.process_article_page(article_country_year_html,year,semifinal_pair_result_ids, stop_at_first=False)
     
    174182        ####
    175183
    176         for cat_country in category_countries.keys():
    177             if (cat_country in article_countries):
    178                 del article_countries[cat_country]
    179             else:
    180                 print("Country listed in Category page that did not reach Final/Semifinal: " + cat_country)
    181                
     184        if (year < 2021):
     185                   
     186            for cat_country in category_countries.keys():
     187                if (cat_country in article_countries):
     188                    del article_countries[cat_country]
     189                else:
     190                    print("Country listed in Category page that did not reach Final/Semifinal: " + cat_country)
     191        else:
     192            print("******")
     193            print("* Skipping country in article check for 2021, as those entries no in DBpedia (at time of coding)")
     194            print("******")
     195           
    182196        # Missing countries are the ones left in 'article_countries'
    183197        if (len(article_countries.keys()) == 0):
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-process-votes.py

    r35105 r35185  
    161161    all_country_year_recs = []
    162162
    163     final_result_ids = [ 'Grand_final', 'Final', 'Results' ]
     163    # final_result_ids = [ 'Grand_final', 'Final', 'Results' ]
     164    final_result_ids = [ 'Final', 'Results' ]
    164165   
    165166    for year in range(start_year, end_year+1):
     167        if (year == 2020):
     168            print("******")
     169            print("* Skipping year 2020, as no contest held that year")
     170            print("******")
     171            continue
     172       
    166173        article_year_html = escwikipedia.retrieve_article_page(year)
    167174        country_year_recs = escwikipedia.process_article_page(article_year_html,year,final_result_ids,
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/escwikipedia.py

    r35128 r35185  
    6565   
    6666    for y in range(1, len(table_rows)):
    67         tds = table_rows[y].find_all("td");
     67        # print("*** table row [" + str(y) + "]: " + repr(table_rows[y]));
     68
     69        tds = table_rows[y].find_all(["th","td"]);
     70        # print("**** tds = " + repr(tds));
    6871        for x in range(0,len(tds)):
    69             val = tds[x]
     72            val = tds[x]           
    7073            header_label = headers[x]
    7174            header_to_vals[header_label].append(val)
     
    137140
    138141def convert_cols_to_country_year_recs(header_to_vals,year,accumulated_country_year_recs,extra_metadata):
    139 
     142    # print("**** convert_cols_to_country_year_recs: year = " + str(year));
     143    # print("**** header_to_vals = " + repr(header_to_vals));
     144         
    140145    a_href_re = re.compile(r"^.*" + str(year) + r"$")
    141146   
     
    143148   
    144149    for country_tag in header_to_vals.get("Country"):
     150        # print("**** country tag = " + repr(country_tag));
    145151        country = country_tag.find("a",href=a_href_re).string
    146152        # print("**** country = " + country)
     
    279285        results_text_span = esc_year_soup.find("span",id=fr_id)
    280286        if (results_text_span is not None):
    281             print("  Found Final Results heading with id: " + fr_id);
     287            print("  Found Final/Semi-Final Results heading with id: " + fr_id);
    282288            results_heading = results_text_span.parent
    283289            results_heading_list.append(results_heading)
     
    290296    for result_heading in results_heading_list:
    291297       
    292         results_table = results_heading.findNext('table')
    293         table_rows = results_table.find_all('tr');
     298        results_table = results_heading.findNext("table")
     299        table_rows = results_table.find_all("tr");
    294300        print("  Number of rows in Results table = " + str(len(table_rows)))
    295301
Note: See TracChangeset for help on using the changeset viewer.