'''
Back up Google spreadsheets. Requires that each sheet is viewable to the world.
Can run this at the bash prompt like so:
    python get_ggle_ss.py
Also outputs the spreadsheet contents to json files.

Note: written for Python 2 (uses urllib.urlopen and urllib.urlretrieve).
'''

import os
import sys
import copy
import urllib
import datetime
import openpyxl as xp
import json

#---------------------
# Set Paths and Links
#---------------------
# pertinent paths
backup_dir      = '.'                                           # Directory in which to save the backup spreadsheet.
variable_dir    = os.path.join(backup_dir, 'variable_tables')   # Directory in which to put the variable json files.

# pertinent URLS
url_google_drive = 'https://drive.google.com/drive/u/1/folders/1UPZoy0oSWz3ZQPiH5twI1T5mWAzvpWNs' # drive dir containing spreadsheets

# list of spreadsheets to download
#   notes:  - 'override_id' tells the json-export step below that this spreadsheet's json
#             files are produced by starting from a copy of the dictionary built for the
#             spreadsheet whose 'source_id' equals 'override_id', then overriding its contents.
#             i.e. if 'source_id' = 'CanESM5-CanOE' and 'override_id' = 'CanESM5', then the
#             CanESM5-CanOE variable tables are created by overriding the contents of the
#             CanESM5 dictionary (see the sketch below).
#           - if a filename is provided, then we check that it matches the filename found in the
#             google drive link
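#
# A minimal sketch of the override behaviour (the variable/table names here are hypothetical,
# for illustration only):
#   var_tables['CanESM5']['Amon']['tas']        -> row dict read from the CanESM5 sheet
#   var_tables['CanESM5-CanOE']['Amon']['tas']  -> starts as a copy of the CanESM5 entry;
#                                                  replaced if the CanOE sheet redefines it
#   var_tables['CanESM5-CanOE']['Omon']['chl']  -> a variable unique to the CanOE sheet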
google_sheet_lst = [] 
d = {
    'url_google_sheet'  : 'https://docs.google.com/spreadsheets/d/1JLsCcp9CCP-39ZE5OCN0xaerMwLe4EvmYj-a4-6KWjM',
    'url_google_drive'  : url_google_drive,
    'filename'          : 'output_variables_dreq01.00.31.xlsx',
    'source_id'         : 'CanESM5',
}
google_sheet_lst.append(d)
d = {
    'url_google_sheet'  : 'https://docs.google.com/spreadsheets/d/1tsymjjjWPVU1Jf5wfsgp8jUE2Y-yJb5qVNPD0hRIAM4',
    'url_google_drive'  : url_google_drive,
    'filename'          : 'output_variables_dreq01.00.31_CANOE.xlsx',
    'source_id'         : 'CanESM5-CanOE',
    'override_id'       : 'CanESM5'
}
google_sheet_lst.append(d)

#----------
# Set Flags
#----------
delete_ss               = True
export_sheet_as_json    = True
add_json_metadata       = True
use_dreqPy              = False # 18dec.18, JA: don't turn this on, but I'm leaving the option here in case we want to incorporate this later
json_column_sort        = True

if json_column_sort:
    # Show the parameters (columns) in the json file ordered in the same way as columns in the spreadsheet. 
    # For json_column_sort=False the parameters (columns) will show up in case-sensitive alphabetical order.
    from collections import OrderedDict as odict
    def make_odict(d, l):
        '''Create an ordered dict (odict) by adding keys of dict d in the order given by list l.'''
        d1 = odict()
        for p in l: d1.update({p: d[p]})
        return d1
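    # e.g. make_odict({'a': 1, 'b': 2}, ['b', 'a']) -> OrderedDict([('b', 2), ('a', 1)])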

##################
# Download sheets
for workbook_info in google_sheet_lst:
    url_google_sheet = workbook_info['url_google_sheet']
    url_google_drive = workbook_info['url_google_drive']

    # Get name of file to back up. 
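    #   The Drive folder page embeds file metadata as escaped JSON, in which the filename
    #   appears between the sheet's id and its mime type; a hypothetical fragment:
    #     ...<sheet_hash>...\x22output_variables_dreq01.00.31\x22...\x22...google-apps.spreadsheet\x22...
    #   The parsing below relies on this layout.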
    urlname     = url_google_drive
    url         = urllib.urlopen(urlname)
    w           = url.read()  
    sheet_hash  = url_google_sheet.split('/')[-1]
    l           = w.split(sheet_hash)
    assert len(l) == 2
    l = l[1].split('google-apps.spreadsheet')
    l = l[0].split('\\x22')
    l = [s for s in l if s.startswith('output_variables') and 'dreq' in s]
    assert len(l) == 1
    l_filename = l

    ext = 'xlsx'
    if 'filename' in workbook_info:
        # Check that we got the expected filename.
        filename0 = '_'.join(l_filename) + '.' + ext
        assert filename0 == workbook_info['filename'], "Downloaded filename {} doesn't match the specified {}".format(filename0, workbook_info['filename'])

    # Add date & time of backup to the filename.
    #   note: - 'h','m','s' characters indicate hours, minutes, seconds, because using ':'
    #           in the filename can cause problems when manipulating it later
    time_fmt    = '%d%b%Y_%Hh%Mm%SsUTC'
    s_date      = datetime.datetime.utcnow().strftime(time_fmt)
    l_filename += ['_BACKUP']
    l_filename += [s_date]
    filename    = '_'.join(l_filename) + '.' + ext
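    # e.g. output_variables_dreq01.00.31_BACKUP_18Dec2018_21h05m07sUTC.xlsx (hypothetical timestamp)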

    # Export the google spreadsheet as an xlsx file,
    #   and add a key in the dictionary that specifies the resulting file location.
    #   The export link has the form <url_google_sheet>/export?exportFormat=xlsx
    urlname     = url_google_sheet + '/export?exportFormat=' + ext     # note: don't build URLs with os.path.join
    filepath    = os.path.join(backup_dir, filename)
    if not os.path.isdir(backup_dir):
        os.makedirs(backup_dir)
        os.chmod(backup_dir, 0o755)
    urllib.urlretrieve(urlname, filepath)
    print('Wrote backup file: ' + filepath)

    # Make the backup file read-only to prevent accidental modification. 
    os.chmod(filepath, 0o444)
    workbook_info['long_filepath'] = filepath

#############################
# Export sheets to json files
if export_sheet_as_json:
    # Create dictionary containing all MIP table information from the downloaded spreadsheets,
    # with the 'source_id' for the specified spreadsheet being the topmost key in the dictionary, 
    # and mip tables directly beneath.
    var_tables = {}
    if use_dreqPy:
        # This branch assumes the data request python software (dreqPy) is loaded as module dq,
        # and that a helper module q (providing checkUnique) and the string dreqPy_version are
        # also available; none of these are set up in this script, so leave use_dreqPy = False.
        tvl = ('mipTable', 'label') # ver. 01.00.27: this pair uniquely identifies the items
        err_str = 'attributes in tuple tvl do not uniquely identify CMORvar items (data request version: ' + dreqPy_version + ')'
        assert q.checkUnique('CMORvar', tvl, dq), err_str
        # Get list of all allowable CMOR variable names. These are the short names (e.g. "tas", "ua").
        l_CMORvar = sorted([itm.label for itm in dq.coll['CMORvar'].items])
        # If for some reason there are allowable CMOR var names that aren't in the data request software,
        # add them to l_CMORvar. The code below will skip rows that don't have a CMOR var name listed in l_CMORvar.
        
    # Load each spreadsheet into dictionary var_tables. If an 'override_id' is given, start from
    # a copy of the dictionary for source_id = 'override_id' and override its contents.
    for workbook_info in google_sheet_lst:
        source_id = workbook_info['source_id']
        print("Populating dictionary for {}".format(source_id))

        #===========================
        # initiate nested dictionary
        #===========================
        # assume that we aren't overriding anything
        is_override = False 
        if 'override_id' in workbook_info:
            is_override = True
            # deep-copy the dictionary from override_id (a plain assignment would alias the two
            # dictionaries, so later edits for source_id would silently modify the override_id entry)
            override_id = workbook_info['override_id']
            try:
                var_tables[source_id] = copy.deepcopy(var_tables[override_id])
            except KeyError as exc:
                print("Trying to build the {} dictionary from the {} dictionary, but the latter hasn't been created!".format(source_id, override_id))
                print('Make sure the dictionary for {} gets created FIRST'.format(override_id))
                print('exiting....')
                sys.exit(1)

            # create override info dictionary 
            override_info = { 'override source' : override_id,
                              'override vars'   : [],
                              'new vars'        : [] }
        else:
            # initiate empty dictionary
            var_tables[source_id] = {}

        #============================
        # load contents of the workbook
        #============================
        filepath = workbook_info['long_filepath']
        workbook = xp.load_workbook(filepath, read_only=True, data_only=True)
        print('Loaded spreadsheet: {}'.format(filepath))

        #==================================================
        # Loop over sheets and create nested dicts for each,
        #   storing them in var_tables[source_id]
        #==================================================
        miptables = workbook.sheetnames
        for miptable in miptables:
            # initiate dictionary
            if miptable not in var_tables[source_id]:
                var_tables[source_id][miptable] = {}
            
            # Read contents of the sheet into a list of lists (like a 2D matrix).
            worksheet = workbook[miptable]
            wrksht_rows = []
            for r in worksheet.rows:
                row_vals = [c.value for c in r]     # the cell values across this row
                wrksht_rows.append(row_vals)
            
            # translate worksheet rows into dictionaries
            col_headers  = wrksht_rows[0]
            p_CMORvar    = 'CMOR Name'
            wrksht_var_dicts = []
            for row in wrksht_rows:
                # skip column headers, which can appear more than once, and rows with too few columns
                if row == col_headers: continue
                if len(row) < len(col_headers): continue

                # build dictionary for each row, keyed off column headers
                wrksht_var_dict = {s : row[k] for k,s in enumerate(col_headers)}
                wrksht_var_dicts.append(wrksht_var_dict)
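            # Each wrksht_var_dict now maps column header -> cell value, e.g. (hypothetical
            # values; only the 'CMOR Name' header is assumed by the code below):
            #   {'CMOR Name': 'tas', 'units': 'K', ...}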
                        
            # store each variable dictionary in the nested dictionary, var_tables
            for wrksht_var_dict in wrksht_var_dicts:
                var = wrksht_var_dict[p_CMORvar]
                
                # Handle the possible case of entries like 'DREQ: hurs, TWIKI: rhs',
                # from which we want just 'hurs'.
                s0 = 'DREQ:'
                if var.startswith(s0):
                    # note: var.strip(s0) would strip *characters*, not the prefix, so slice instead
                    var = var[len(s0):].split(',')[0].strip()
                else:
                    # make sure not to miss a case of this
                    assert s0 not in var, "{} should not be in the var name".format(s0)
                    
                if use_dreqPy:
                    if var not in l_CMORvar:
                        #print ' '*20, miptable, var
                        continue

                if is_override: 
                    if var in var_tables[source_id][miptable]: 
                        print('Warning: overwriting CMOR variable {} in MIP table {}'.format(var,miptable))
                        override_info['override vars'].append("{}-{}".format(var,miptable))
                    else:
                        # this is a variable unique to the override tables
                        override_info['new vars'].append("{}-{}".format(var,miptable))
                
                if json_column_sort:
                    var_tables[source_id][miptable][var] = make_odict(wrksht_var_dict, col_headers)
                else:
                    var_tables[source_id][miptable][var] = wrksht_var_dict
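            # The full row dict for each variable can now be looked up as, e.g. (hypothetical
            # names): var_tables['CanESM5']['Amon']['tas']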

        #==============================
        # delete spreadsheet if desired
        #==============================
        if delete_ss:
            print("Wrote {} into dictionary.. \n\t .. deleting xlsx file".format(filepath))
            os.remove(filepath)
    
        #===============================
        # write dictionary to JSON files
        #===============================
        # check that the necessary directories exist
        if not os.path.isdir(variable_dir):
            os.makedirs(variable_dir)
            os.chmod(variable_dir, 0o755)
        source_id_dir = os.path.join(variable_dir, source_id)
        if not os.path.isdir(source_id_dir):
            os.makedirs(source_id_dir)
            os.chmod(source_id_dir, 0o755)

        # write each mip table to a different file
        miptables = sorted( var_tables[source_id].keys(), key=lambda s: s.lower() )     # note: str.lower would fail on unicode sheet names in python 2
        for miptable in miptables:
            jsonfile = 'CMIP6_{}.json'.format(miptable)
            filepath = os.path.join(source_id_dir, jsonfile)

            if add_json_metadata:
                # Include info about where the json contents came from
                metadata_key = '_metadata_'

                # ensure we aren't overwriting a metadata field from the CMIP6 tables
                #   - this likely isn't needed.. we'd likely know if they added this field.
                if not is_override:
                    # this would always be true for override tables
                    assert metadata_key not in var_tables[source_id][miptable]

                # get source file list
                source_files = os.path.basename(workbook_info['long_filepath'])
                if is_override:
                    #=======================
                    # NOTE: it is possible that this table wasn't in the override_id's dictionary
                    #=======================
                    try:
                        add_src_files = var_tables[override_id][miptable][metadata_key]['Source file']
                    except KeyError as exc:
                        print('Missing {} entry for MIP table {} of {}: {}'.format(metadata_key, miptable, override_id, exc))
                        print(var_tables[override_id].get(miptable, '<table not present in the override dictionary>'))
                        sys.exit(1)
                    source_files  += " {}".format(add_src_files)

                # create meta data dictionary and store
                d = {
                    'Source file'   : source_files,
                    'MIP table'     : miptable
                }
                var_tables[source_id][miptable][metadata_key] = d
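                # e.g. (hypothetical filename):
                #   {'Source file': 'output_variables_dreq01.00.31_BACKUP_18Dec2018_21h05m07sUTC.xlsx',
                #    'MIP table': 'Amon'}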

            sort_keys = True
            if json_column_sort:
                # If the input is an ordered dict, turn off alphabetical sorting (sort_keys=True)
                # to preserve the order that's already specified in the dict.
                var_tables[source_id][miptable] = make_odict( var_tables[source_id][miptable], sorted(var_tables[source_id][miptable].keys()) )
                sort_keys = False
            
            with open(filepath, 'w') as f:
                w = json.dumps(var_tables[source_id][miptable], indent=2, sort_keys=sort_keys)
                f.write(w)
                os.chmod(filepath, 0o744)
                print('Wrote output file: ' + filepath)

            if False:   # set to True to test that reloading the json file returns the same info
                with open(filepath, 'r') as f:
                    d = json.load(f)
                    print('Loaded json file: ' + filepath)
                    assert d == var_tables[source_id][miptable]

        # write out override info file if necessary
        if is_override: 
            filepath = os.path.join(source_id_dir,'override_info.json')
            with open(filepath, 'w') as f:
                json_str = json.dumps(override_info, indent=2)
                f.write(json_str)
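            # override_info.json then contains, e.g. (hypothetical entries):
            #   {"override source": "CanESM5",
            #    "override vars": ["tas-Amon"],
            #    "new vars": ["chl-Omon"]}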
            
