Ignore:
Timestamp:
2019-06-22T10:10:17+12:00 (5 years ago)
Author:
davidb
Message:

Changes and refactoring to work with the new (XLSX) spreadsheet shared through OneDrive

Location:
main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports
Files:
1 added
3 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/PREPARE-IMPORT.sh

    r33029 r33188  
    22
    33
    4 python prepare/xls-to-csv.py import/Report\ spreadsheet\ for\ website\ Feb\ 2019.xls
     4#python prepare/xls-to-csv.py import/Report\ spreadsheet\ for\ website\ Feb\ 2019.xls
     5
     6echo "Generating sanitized CSV version of spreadsheet suitable for download through DL"
     7python prepare/xls-to-sanitized-csv.py "import/Archaeological report PDFs 20190620.xlsx"
     8
     9echo "Generating PDF-Bound and Unbound CSV files for use in import.pl"
     10python prepare/xls-to-csv.py "import/Archaeological report PDFs 20190620.xlsx"
     11
     12
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-csv.py

    r33024 r33188  
    33import os
    44
     5import xlsutil
     6
    57## worksheet_name = "Archaeological reports"
    6 
    7 
    8 # https://code.activestate.com/recipes/546518-simple-conversion-of-excel-files-into-csv-and-yaml/download/1/
    9 def format_excelval(book, type, value, wanttupledate):
    10     """ Clean up the incoming excel data """
    11     ##  Data Type Codes:
    12     ##  EMPTY   0
    13     ##  TEXT    1 a Unicode string
    14     ##  NUMBER  2 float
    15     ##  DATE    3 float
    16     ##  BOOLEAN 4 int; 1 means TRUE, 0 means FALSE
    17     ##  ERROR   5
    18     returnrow = []
    19     if   type == 2: # TEXT
    20         if value == int(value): value = int(value)
    21     elif type == 3: # NUMBER
    22         datetuple = xlrd.xldate_as_tuple(value, book.datemode)
    23         value = datetuple if wanttupledate else tupledate_to_isodate(datetuple)
    24     elif type == 5: # ERROR
    25         value = xlrd.error_text_from_code[value]
    26     return value
    27 
    288
    299letter_to_folder = {
     
    6848
    6949
     50def fixup_cell(col_num,cell_str):
     51    # col == 1 => Author
     52    # col == 2 => Title
     53    # col == 4 => TLA
     54    # col == 5 => AuthorityNo
     55    # col == 6 => SiteNo
     56
     57    if col_num == 1: #Author
     58        cell_str = re.sub('\|?(et al)|(Associates)', '', cell_str)
     59
     60    if col_num == 2: #Title
     61        # Detect any ALL-CAPS titles, and change to title-case
     62        onlyletters = re.sub('[^A-Za-z]+', '', cell_str)
     63        if onlyletters.isupper:
     64            cell_str = cell_str.title()
     65
     66    if col_num == 4: #TLA
     67        if cell_str == "All":     cell_str = ""
     68        if cell_str == "n/a":     cell_str = ""
     69        if cell_str == "various": cell_str = "" # not a meaningful TLA
     70
     71
     72    return cell_str
     73
     74
    7075def excel_to_bound_pdf_csv(excel_filename):
    71     workbook = xlrd.open_workbook(excel_filename, formatting_info=True)
     76    #workbook = xlrd.open_workbook(excel_filename, formatting_info=True)
     77    workbook = xlrd.open_workbook(excel_filename)
    7278    ## worksheet = workbook.sheet_by_name(worksheet_name)
    7379    worksheet = workbook.sheet_by_index(0)
     
    9096    unbound_wr  = csv.writer(csv_unbound_ofile, quoting=csv.QUOTE_ALL)
    9197
    92     # 1. Skip lines until Header encountered (as defined by encountering "Report Identifier")
    93     # 2. Write out Header line as Greenstone friendly Metadata terms
     98#    # 1. Skip lines until Header encountered (as defined by encountering "Report Identifier")
     99#    # 2. Write out Header line as Greenstone friendly Metadata terms
     100#    #      => remove sp spaces, change "Report Identifier" to hnz.Identifier
     101#    # 3. Process the rest of the file, checking if accompanying
     102#    #     PDF file present or not
     103
     104
     105    # 1. For header-line, build up hashmap of header-names to column number
     106    # 2. Write out header-line as Greenstone friendly Metadata terms
    94107    #      => remove sp spaces, change "Report Identifier" to hnz.Identifier
    95108    # 3. Process the rest of the file, checking if accompanying
    96     #     PDF file present or not
     109    #     PDF file present or not and only printing out the header-names
     110    #     specified in 'sanitized_headers'
     111
    97112
    98113    num_rows = worksheet.nrows
    99114
    100115    # 1, Skip lines until Header encountered
    101     row_i = 0
    102     found_header = False
    103     while row_i<num_rows:
    104         first_cell = worksheet.cell_value(row_i,0);
    105         if first_cell == "Report Identifier":
    106             found_header = True
    107             break
    108         print "Skipping row {} as not yet encountered 'Report Identifier' metadata label in column 0".format(row_i)
    109         row_i = row_i + 1
     116#    row_i = 0
     117#    found_header = False
     118#    while row_i<num_rows:
     119#        first_cell = worksheet.cell_value(row_i,0);
     120#        if first_cell == "Report Identifier":
     121#            found_header = True
     122#            break
     123#        print "Skipping row {} as not yet encountered 'Report Identifier' metadata label in column 0".format(row_i)
     124#        row_i = row_i + 1
     125
     126    header_names_mapping = xlsutil.getHeaderMappingToColNum(worksheet)
    110127
    111128    # 2. Process Header into Greenstone friendly metadata terms     
    112     if found_header:
     129    if header_names_mapping is None:
     130        print "Failed to find metadata label 'Report Identifier' in column 0"
     131        exit()
     132    else:
    113133        entry_utf8_row = []
    114         for entry in worksheet.row_values(row_i):
    115             if entry == "Report Identifier":
    116                 entry = "hnz.Identifier"
    117 
    118             if entry == "Title":
    119                 entry = "dc.Title"
    120 
    121             entry_utf8 = unicode(entry).encode("utf-8")
    122             metadata_entry_utf8 = entry_utf8.replace(" ", "")
    123 
    124             entry_utf8_row.append(metadata_entry_utf8)
     134        row_i = 0;
     135
     136        for header_name in xlsutil.sanitized_headers:
     137
     138            if header_name in header_names_mapping:
     139                header_col_j = header_names_mapping[header_name]
     140
     141                header_cell_value = worksheet.cell_value(row_i,header_col_j)
     142               
     143                if header_cell_value == "Report Identifier":
     144                    header_cell_value = "hnz.Identifier"
     145
     146                if header_cell_value == "Title":
     147                    header_cell_value = "dc.Title"
     148
     149                if header_cell_value == "RelevantTLA's":
     150                    header_cell_value = "TLA"
     151
     152                header_cell_value_utf8 = unicode(header_cell_value).encode("utf-8")
     153                metadata_name_utf8 = header_cell_value_utf8.replace(" ", "")
     154
     155                entry_utf8_row.append(metadata_name_utf8)
     156
     157            else:
     158                print("Warning: Failed to column mapping in spreadsheet for header name \""+header_name+"\" => skipping")
     159
    125160        unbound_wr.writerow(entry_utf8_row)
    126161
     
    128163        entry_utf8_row.insert(0, "Filename")
    129164        pdfbound_wr.writerow(entry_utf8_row)
    130 
     165                     
     166        # 3. Process the rest of the file ...
    131167
    132168        row_i = row_i + 1
     
    137173            found_pdf = False
    138174
    139             for col_j in range(num_cols):
    140                 cell = worksheet.cell(row_i,col_j)
    141 
    142                 cell_type  = worksheet.cell_type(row_i,col_j)
    143                 cell_value = worksheet.cell_value(row_i,col_j)
    144 
    145                 formatted_cell = format_excelval(workbook,cell_type,cell_value,False)
    146 
    147                 if col_j == 0:
    148                     # Check to see if companion PDF file present
    149                     # pdf_filename = os.path.join(excel_dirname,"pdfs",formatted_cell+".pdf")
    150                     id = formatted_cell.replace(" ","")
    151 
    152                     pdf_file = id_to_relative_pdf_file(id)
    153                     pdf_file_root, pdf_ext = os.path.splitext(pdf_file)
    154                     PDF_file = pdf_file_root + ".PDF"
    155 
    156                     pdf_filename = os.path.join(excel_dirname,pdf_file)
    157                     PDF_filename = os.path.join(excel_dirname,PDF_file)
    158 
    159                     if os.path.exists(pdf_filename):
    160                         found_pdf = True
    161                         formatted_utf8_row.insert(0, pdf_file)
    162                     elif os.path.exists(PDF_filename):
    163                         found_pdf = True
    164                         formatted_utf8_row.insert(0, PDF_file)
    165                     else:
    166                         print "Unbound id: '" + id + "'"
    167 
    168                 if isinstance(formatted_cell, basestring):
    169                     # Remove any trailing whitespace. 
    170                     # Newline at end particular harmful for a entry in the CSV file
    171                     formatted_cell = formatted_cell.rstrip();
    172 
    173                 formatted_cell_utf8 = unicode(formatted_cell).encode("utf-8")
    174                 formatted_utf8_row.append(formatted_cell_utf8)
     175#            for col_j in range(num_cols):
     176            for header_name in xlsutil.sanitized_headers:
     177
     178                if header_name in header_names_mapping:
     179                    col_j = header_names_mapping[header_name]
     180
     181#                    cell = worksheet.cell(row_i,col_j)
     182
     183#                    cell_type  = worksheet.cell_type(row_i,col_j)
     184                    cell_value = worksheet.cell_value(row_i,col_j)
     185
     186#                    formatted_cell_value = format_excelval(workbook,cell_type,cell_value,False)
     187                    formatted_cell_value = xlsutil.format_if_int(cell_value)
     188
     189#                    if col_j == 0:
     190                    if header_name == "Report Identifier":
     191                        # Check to see if companion PDF file present
     192                        # pdf_filename = os.path.join(excel_dirname,"pdfs",formatted_cell_value+".pdf")
     193                        id = formatted_cell_value.replace(" ","")
     194                       
     195                        pdf_file = id_to_relative_pdf_file(id)
     196                        pdf_file_root, pdf_ext = os.path.splitext(pdf_file)
     197                        PDF_file = pdf_file_root + ".PDF"
     198                   
     199                        pdf_filename = os.path.join(excel_dirname,pdf_file)
     200                        PDF_filename = os.path.join(excel_dirname,PDF_file)
     201
     202                        if os.path.exists(pdf_filename):
     203                            found_pdf = True
     204                            formatted_utf8_row.insert(0, pdf_file)
     205                        elif os.path.exists(PDF_filename):
     206                            found_pdf = True
     207                            formatted_utf8_row.insert(0, PDF_file)
     208                        else:
     209                            print "Unbound id: '" + id + "'"
     210
     211                    if isinstance(formatted_cell_value, basestring):
     212                        # Remove any trailing whitespace. 
     213                        # Newline at end particular harmful for a entry in the CSV file
     214                        formatted_cell_value = formatted_cell_value.rstrip();
     215
     216                    formatted_cell_value_utf8 = unicode(formatted_cell_value).encode("utf-8")
     217
     218                    ## Perform any cell transformations to make DL used spreadsheet
     219                    ## cleaner to build
     220                    # formatted_cell_value_utf8 = fixup_cell(col_j,formatted_cell_value_utf8)
     221
     222                    formatted_utf8_row.append(formatted_cell_value_utf8)
     223                else:
     224                    print "Warning: No column number mapping for header name \""+header_name+"\" => skipping"
     225
    175226            if found_pdf:
    176227                pdfbound_wr.writerow(formatted_utf8_row)
     
    179230
    180231            row_i = row_i + 1
    181     else:
    182         print "Failed to find metadata label 'Report Identifier' in column 0"
    183232
    184233
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-sanitized-csv.py

    r33174 r33188  
    33import os
    44
     5import xlsutil
     6
    57## worksheet_name = "Archaeological reports"
    68
    7 sanitized_headers = [ "Report Identifier", "Author", "Title", "Produced By", "Date", "HNZPT Region", "File No",
    8                       "Authority No", "Site No", "Relevant TLA's", "Record Type" ]
    99
    1010def excel_to_sanitized_csv(excel_filename):
     
    2727
    2828    # 1. For header line, Build up hashmap of header-names to column number
    29     # 2. Write out CSV file for only the header-names in 'keep_header_names'
     29    # 2. Write out CSV file for only the header-names in 'sanitized_headers'
    3030
    3131    num_rows = worksheet.nrows
    3232
    33     header_names_mapping = {}
     33#    row_i = 0
     34#    num_header_cols = worksheet.row_len(row_i)
    3435
    35     # 1. For header line, Build up hashmap of header-names to column number
    36     # Assume header line is first row of spreadsheet
    37     row_i = 0
    38     num_header_cols = worksheet.row_len(row_i)
     36    # 1. get header-map
     37    header_names_mapping = xlsutil.getHeaderMappingToColNum(worksheet)
    3938
    40     found_header = False
    41     for col_j in range(num_header_cols):
    42         header_cell_value = worksheet.cell_value(row_i,col_j)
    43         if header_cell_value == "Report Identifier":
    44             found_header = True
    45 
    46         header_names_mapping[header_cell_value] = col_j
    47 
    48     if not(found_header):
     39#    found_header = False
     40#    for col_j in range(num_header_cols):
     41#        header_cell_value = worksheet.cell_value(row_i,col_j)
     42#        if header_cell_value == "Report Identifier":
     43#            found_header = True
     44#
     45#        header_names_mapping[header_cell_value] = col_j
     46#
     47#    if not(found_header):
     48    if header_names_mapping is None:
    4949        print "Failed to find \"Report Identifier\" header in spreadsheet (Row 0 or Sheet 0)"
    5050        exit(1)
    5151
    5252    # Move on to the start of the data values
    53     row_i = row_i + 1
     53    row_i = 1
    5454
     55    # 2. Write out CSV file ...
    5556    while row_i<num_rows:
    5657        # Work through sanitized header names building up row of utf8 values
    5758        sanitized_utf8_row = []
    5859
    59         for header_name in sanitized_headers:
     60        for header_name in xlsutil.sanitized_headers:
    6061
    6162            if header_name in header_names_mapping:
     
    6465                cell_value = worksheet.cell_value(row_i,header_col_j)
    6566
    66                 #formatted_cell = format_excelval(workbook,cell_type,cell_value,False)
    6767
    68                 # Numbers from worksheet are represented as floating-point type
    69                 # This causes a problem when it is an 'int' as it then gets
    70                 # written out as a floating point number (with a '.0')
    71                 # => test for float and when there is no value after
    72                 #    the decimal point, explicitly cast to 'int'
    73                 if type(cell_value) is float:
    74                     if cell_value == int(cell_value): cell_value = int(cell_value)
    7568
    7669                cell_value_utf8 = unicode(cell_value).encode("utf-8")
Note: See TracChangeset for help on using the changeset viewer.