Changeset 33202


Ignore:
Timestamp:
2019-06-22T17:13:38+12:00 (5 years ago)
Author:
davidb
Message:

Bound and Unbound CSV files changed to pring out all non-empty header columns

Location:
main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/PREPARE-IMPORT.sh

    r33192 r33202  
    77python prepare/xls-to-sanitized-csv.py \
    88  "import/Archaeological report PDFs 20190620.xlsx" \
    9   "for-download/Archaeological-Reports-20190620.xlsx"
     9  "for-download/Archaeological-Reports-20190620.csv"
    1010
    1111echo "Generating PDF-Bound and Unbound CSV files for use in import.pl"
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-csv.py

    r33191 r33202  
    103103    # 3. Process the rest of the file, checking if accompanying
    104104    #     PDF file present or not and only printing out the header-names
    105     #     specified in 'sanitized_headers'
     105    #     as long as it has a non-empty header-name
     106##    #     specified in 'sanitized_headers'
    106107
    107108    num_rows = worksheet.nrows
     
    117118
    118119        entry_utf8_row = []
    119         row_i = 0;
    120 
    121         for header_name in xlsutil.sanitized_headers:
    122 
    123             if header_name in header_names_mapping:
     120        num_header_cols = worksheet.row_len(xlsutil.header_row_pos)
     121
     122        for col_j in range(num_header_cols):
     123##        for header_name in xlsutil.sanitized_headers:
     124
     125            header_name = worksheet.cell_value(xlsutil.header_row_pos,col_j)
     126
     127            if header_name and header_name.strip():
     128####                print "*** 2. header_name = " + header_name
     129
    124130                header_col_j = header_names_mapping[header_name]
    125 
    126                 header_cell_value = worksheet.cell_value(row_i,header_col_j)
     131##            if header_name in header_names_mapping:
     132##                header_col_j = header_names_mapping[header_name]
     133
     134##                header_cell_value = worksheet.cell_value(row_i,header_col_j)
    127135               
    128                 if header_cell_value == "Report Identifier":
    129                     header_cell_value = "hnz.Identifier"
    130 
    131                 if header_cell_value == "Title":
    132                     header_cell_value = "dc.Title"
    133 
    134                 if header_cell_value == "RelevantTLA's":
    135                     header_cell_value = "TLA"
    136 
    137                 header_cell_value_utf8 = unicode(header_cell_value).encode("utf-8")
    138                 metadata_name_utf8 = header_cell_value_utf8.replace(" ", "")
     136                if header_name == "Report Identifier":
     137                    header_name = "hnz.Identifier"
     138
     139                if header_name == "Title":
     140                    header_name = "dc.Title"
     141
     142                if header_name == "RelevantTLA's":
     143                    header_name = "TLA"
     144
     145                header_name_utf8 = unicode(header_name).encode("utf-8")
     146                metadata_name_utf8 = header_name_utf8.replace(" ", "")
    139147
    140148                entry_utf8_row.append(metadata_name_utf8)
    141149
    142             else:
    143                 print("Warning: Failed to column mapping in spreadsheet for header name \""+header_name+"\" => skipping")
     150##            else:
     151##                print("Warning: Failed to column mapping in spreadsheet for header name \""+header_name+"\" => skipping")
    144152
    145153        unbound_wr.writerow(entry_utf8_row)
     
    151159        # 3. Process the rest of the file (metadata values) ...
    152160
    153         row_i = row_i + 1
     161        row_i = xlsutil.header_row_pos + 1
    154162        while row_i<num_rows:
    155163            num_cols = worksheet.row_len(row_i)
     
    158166            found_pdf = False
    159167
    160 #            for col_j in range(num_cols):
    161             for header_name in xlsutil.sanitized_headers:
    162 
    163                 if header_name in header_names_mapping:
     168            for col_j in range(num_cols):
     169##            for header_name in xlsutil.sanitized_headers:
     170                header_name = worksheet.cell_value(xlsutil.header_row_pos,col_j)
     171
     172                if header_name and header_name.strip():
    164173                    col_j = header_names_mapping[header_name]
     174##                if header_name in header_names_mapping:
     175##                    col_j = header_names_mapping[header_name]
    165176
    166177                    cell_value = worksheet.cell_value(row_i,col_j)
     
    201212
    202213                    formatted_utf8_row.append(formatted_cell_value_utf8)
    203                 else:
    204                     print "Warning: No column number mapping for header name \""+header_name+"\" => skipping"
     214##                else:
     215##                    print "Warning: No column number mapping for header name \""+header_name+"\" => skipping"
    205216
    206217            if found_pdf:
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-sanitized-csv.py

    r33192 r33202  
    3636        exit(1)
    3737
    38     # Move on to the start of the data values
    39     row_i = 1
    40 
    4138    # 2. Write out CSV file ...
     39    # Work through all the row to the spreadsheet, including the header-names
     40    row_i = xlsutil.header_row_pos
     41   
    4242    while row_i<num_rows:
    4343        # Work through sanitized header names building up row of utf8 values
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xlsutil.py

    r33191 r33202  
    2323
    2424
     25header_row_pos = 0
     26
    2527def getHeaderMappingToColNum(worksheet):
    2628
     
    3032    header_names_mapping = {}
    3133
    32     row_i = 0
    33     num_header_cols = worksheet.row_len(row_i)
     34    num_header_cols = worksheet.row_len(header_row_pos)
    3435
    3536    found_header = False
    3637    for col_j in range(num_header_cols):
    37         header_cell_value = worksheet.cell_value(row_i,col_j)
     38        header_cell_value = worksheet.cell_value(header_row_pos,col_j)
    3839        if header_cell_value == "Report Identifier":
    3940            found_header = True
    4041
    41         header_names_mapping[header_cell_value] = col_j
     42        if header_cell_value and header_cell_value.strip():
     43            header_names_mapping[header_cell_value] = col_j
    4244
    4345    if not found_header:
Note: See TracChangeset for help on using the changeset viewer.