#!/usr/bin/env python
'''
Script to generate retracted_vars.json file.
This file keeps track of retracted CCCma datasets.
It is ingested by pycmor when running in production mode (--prod flag) to
ensure that we don't accidentally use the wrong version string for any
dataset that has been previously retracted.
This script uses functionality from the search_esgf utility. 

How to generate the retracted_vars.json file
--------------------------------------------

First install search_esgf:

    git clone git@gitlab.science.gc.ca:rja001/search_esgf.git

and then do

    cd search_esgf
    ./setup.py retracted_vars
    cd searches/retracted_vars/

"retracted_vars" is the working directory name. (It doesn't have to be called
"retracted_vars", its name is arbitrary.) A script called 'search.py' will be
in the working dir, but ignore this. Copy this script (get_retracted_vars.py)
into the working dir and then run it. It can be run at the shell prompt, e.g.

    python get_retracted_vars.py

or by starting a python or ipython session, e.g.

    cccanpy
    ipython

and then in the ipython session:

    %run get_retracted_vars.py

(On Python 2, execfile('get_retracted_vars.py') also works.)

This should create the retracted_vars.json file in the working dir. 
Another file, retracted_vars_info.txt, is also created. This has some info
about how the retracted_vars.json file was created, to aid with
reproducibility. Both files should be committed to the experiments repo.

Note: since the search looks for all published datasets matching the search
filters specified below, it can take a few minutes to run.
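
To sanity-check the output afterwards, the json file can be loaded back with
pandas (a minimal sketch, assuming pandas is available in the environment):

    import pandas as pd
    df = pd.read_json('retracted_vars.json')
    print(len(df), 'retracted datasets')
    print(df[['varname', 'version']].head())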
'''

search_esgf = True           # run the ESGF search (must be True for write_retracted_list to work)
write_retracted_list = True  # write retracted_vars.json from the search results


# Search filters
#
# "filters": is a list of dicts, each of which defines a search by giving a
# set of search parameters (also referred to as search facets). A "dataset"
# in CMIP6 parlance refers to one output field from one model run (e.g. 
# surface temperature from one ensemble member of the historical experiment
# from one model is a single "dataset"). The following searchable parameters
# identify datasets:
#
#   parameter           example value
#   ---------           -------------
#   'mip_era'           'CMIP6'
#   'activity_drs'      'ScenarioMIP'
#   'institution_id'    'CCCma'
#   'source_id'         'CanESM5'
#   'experiment_id'     'ssp245'
#   'member_id'         'r1i1p1f1'
#   'table_id'          'Amon'
#   'variable_id'       'tas'
#   'grid_label'        'gn'
# 
# Any of these parameters can be specified in the filter dicts in the 
# "filters" list. The parameters narrow down the search, so to search for
# all possible values of a given parameter just leave that parameter out.
# For example: to search for all available ensemble members, leave out the
# 'member_id' parameter.

# A dataset is also identified by its version string (e.g. 'v20190429').
# By default the search will look for the most recent version of any
# given dataset. To instead search for a range of versions, specify a 
# min and/or max version string (as done under "More Search Options" on the
# ESGF website), for example:
#   'min_version'   : 20190301,
#   'max_version'   : 20190424,
# These work as inclusive bounds (<= and >= operators); e.g.
# min_version=20190306 will match version v20190306, but not v20190305.
#
# Any number of filter dicts can be appended to the "filters" list. All of
# them will be searched, and the results will be consolidated to remove any
# redundancy (i.e. the same dataset won't be counted / downloaded more than
# once).
filters = []

# Set filters as needed to find all retracted CCCma datasets. 
f = {
    'source_id'     : ['CanESM5'],
    'min_version'   : 20190301, # no CCCma CMIP6 data has been published with a version earlier than this
#    'max_version'   : 20190424,
#    'variable_id'   : ['siconc'], # just for testing (to limit the number of datasets found)
}
filters.append(f)
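
# For illustration only (hypothetical parameter values), a second, narrower
# filter could be appended alongside the one above, e.g.:
#
#     f2 = {
#         'source_id'     : ['CanESM5'],
#         'experiment_id' : ['ssp245'],
#         'table_id'      : ['Amon'],
#         'min_version'   : 20190301,
#         'max_version'   : 20190424,
#     }
#     filters.append(f2)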


###############################################################################
import sys
import os
import datetime
import json
import inspect
try:
    from importlib import reload   # Python 3 (reload is a builtin in Python 2)
except ImportError:
    pass
# Compatibility: Python 2 has separate str/unicode types, Python 3 does not.
try:
    string_types = (str, unicode)  # Python 2
except NameError:
    string_types = (str,)          # Python 3
bin_dir = '../../bin'
if bin_dir not in sys.path: sys.path.append(bin_dir)
import esgfsearch as es
reload(es)  # pick up code changes when re-running in the same interactive session

# Get latest git commit of the search_esgf repo.
cmd = 'git rev-parse HEAD'
filename = 'latest_git_commit'
os.system('{} > {}'.format(cmd, filename))
with open(filename, 'r') as f: latest_git_commit = f.read().strip()
# (Note: using os.system here instead of subprocess.Popen because subprocess
# seems to have a large memory footprint that can cause problems.)
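# For reference, a subprocess-based equivalent would be the following sketch
# (not used here because of the memory concern noted above; the decode() is
# needed on Python 3, where check_output returns bytes):
#
#     import subprocess
#     latest_git_commit = subprocess.check_output(
#         ['git', 'rev-parse', 'HEAD']).decode().strip()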

require_clean_repo = True
if require_clean_repo: 
    # Find out if repo has any uncommitted changes. 
    # We want there to be none, so as to guarantee that the repo commit hash
    # recorded in the retracted_vars.json file indicates the code that was
    # used to run the script.
    cmd = 'git status --porcelain'
    filename = 'repo_status'
    os.system('{} > {}'.format(cmd, filename))
    with open(filename, 'r') as f: repo_status = f.read().strip()
    assert repo_status == '', 'There are uncommitted changes in the search_esgf git repository.'
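    # (For example, a dirty repo would produce porcelain lines such as
    # ' M search.py' or '?? newfile.py', either of which trips the assertion.)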

###############################################################################

# Dict to map parameter names as given in d_found[dataset]['doc'] to those
# used in pycmor (i.e. in the original retracted_vars.json file that Andrew
# created).
d_pycmor2esgf = {
    # pycmor            # ESGF search (the 'doc' dict returned by the search)
    'experiment_id'     : 'experiment_id',
    'member_id'         : 'member_id',
    'name'              : 'master_id',
    'publish_time'      : '_timestamp',
    'Source ID'         : 'source_id',
    'table'             : 'table_id',
    'varname'           : 'variable_id',
    'version'           : 'version',
}
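
# For illustration (hypothetical values), the relevant entries of a 'doc' dict
# returned by the search might look like:
#
#     {'master_id'  : 'CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.Amon.tas.gn',
#      '_timestamp' : '2019-04-29T19:23:44Z',
#      'version'    : '20190429',
#      'retracted'  : True,
#      ...}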

if search_esgf:
    # Search ESGF for retracted datasets. 
    
    # lp_doc = parameters from the search results that we want to keep.
    lp_doc = list(d_pycmor2esgf.values())
    lp_doc.append('retracted')
    
    d_found, d_info_found = es.search(filters, keep_params=lp_doc)

if write_retracted_list:
    # Get list of retracted datasets
    l_retracted = [dataset for dataset in d_found if d_found[dataset]['doc']['retracted']]

    l_retracted.sort()
    print('\nFound {0} retracted datasets'.format(len(l_retracted)))
    sort_datasets_by_timestamp = True
    if sort_datasets_by_timestamp:
        # (For retracted datasets, I believe the timestamp is the retraction time)
        lt = [(d_found[dataset]['doc']['_timestamp'], dataset) for dataset in l_retracted]
        lt.sort()
        l_retracted = [dataset for _, dataset in lt]  # works on Python 2 and 3 (zip(*lt)[1] fails on Python 3)
    # Formatting string to reproduce the time format used by esglist_datasets
    time_fmt = '%Y-%m-%d %H:%M:%S'
    # Format used for _timestamp in the 'doc' dict returned by the ESGF search (e.g. '1850-01-16T12:00:00Z')
    time_fmt_doc    = '%Y-%m-%dT%H:%M:%SZ' 
    time_fmt_doc_ms = '%Y-%m-%dT%H:%M:%S.%fZ' # sometimes the time string includes microseconds
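    # e.g. '2019-04-29T19:23:44Z' (or, with microseconds,
    # '2019-04-29T19:23:44.123456Z') is converted below to '2019-04-29 19:23:44'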

    ld = []
    lp = list(d_pycmor2esgf.keys())
    all_versions = set()
    for dataset in l_retracted:
        doc = d_found[dataset]['doc']
        # Get the needed parameters for the retracted variables file
        d = {}
        for p in lp:
            d[p] = doc[ d_pycmor2esgf[p] ]
        for p in d:
            if   isinstance(d[p], string_types):
                pass
            elif isinstance(d[p], (list, tuple)):
                # some parameters come back as single-element lists
                assert len(d[p]) == 1
                d[p] = d[p][0]
            else:
                assert False, 'unexpected type for parameter {0}: {1}'.format(p, type(d[p]))
        # Convert format of the timestamp string
        s = d['publish_time']
        if '.' in s.split(':')[-1]:
            # timestamp string includes microseconds
            fmt = time_fmt_doc_ms
        else:
            # timestamp string doesn't include microseconds
            fmt = time_fmt_doc
        dt = datetime.datetime.strptime(s, fmt)
        d['publish_time'] = dt.strftime(time_fmt)
        # Make version an integer rather than string ('doc' dict has it as string)
        d['version'] = int(d['version'])
        
        ld.append(d)
        all_versions.add(d['version'])
 
    # Convert ld into the format used in the retracted_vars.json file
    d_json = {}
    for p in lp:
        d_json[p] = {}
    for k,d in enumerate(ld):
        for p in lp:
            d_json[p][k] = d[p]
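    # The result is column-oriented (one dict per parameter, keyed by row
    # index), which is the layout pandas reads back directly. For illustration
    # (hypothetical values):
    #
    #     {'varname'       : {0: 'tas',    1: 'siconc'},
    #      'version'       : {0: 20190306, 1: 20190429},
    #      'experiment_id' : {0: 'ssp245', 1: 'historical'},
    #      ...}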
        
    # Save as json
    filename = 'retracted_vars.json'
    path = '.'
    filepath = os.path.join(path, filename)
    with open(filepath, 'w') as f:
        json.dump(d_json, f, indent=2, sort_keys=True)
        print('\nWrote retracted variables file ({0} datasets): {1}'.format(len(ld), filepath))    
        os.chmod(filepath, 0o644)
    filename_rv = filename

    # Write a text file with info about how the json file was generated
    # (Note: another way to record this info would be as a "METADATA" key in
    # the json file itself, i.e. d_json['METADATA']. However this messes up
    # the loading of the json file by pandas in pycmor, and would prevent
    # recording the chksum of the json file.)
    lw = ['Info about {0}:'.format(filename_rv)]
    # Info about the retracted datasets
    lw.append('number of retracted datasets: {0}'.format(len(ld)))
    lw.append('latest dataset version among retracted datasets: {0}'.format(max(all_versions)))
    # Info about how the json file was generated
    lw.append('created at: ' + datetime.datetime.utcnow().strftime('%d %b %Y %H:%M:%S UTC'))
    frame = inspect.stack()[0]
    this_script = frame[1]  # filename of the currently running script
    lw.append('created by: ' + this_script)
    # Record the commit of search_esgf under which this script was run.
    # (The check for uncommitted changes, done earlier, should ensure this
    # commit does represent the code used to generate the file.)
    lw.append('search_esgf commit: ' + latest_git_commit)
    # Record the chksum of the json file
    cmd = 'sha256sum ' + filename_rv
    filename = 'file_chksum'
    os.system('{} > {}'.format(cmd, filename))
    with open(filename, 'r') as f: file_chksum = f.read().split()[0].strip()
    os.remove(filename)
    lw.append('SHA256 chksum of {0}: {1}'.format(filename_rv, file_chksum))
    
    filename = '{0}_info.txt'.format(os.path.splitext(filename_rv)[0])
    w = '\n'.join(lw)
    filepath = os.path.join(path, filename)
    with open(filepath, 'w') as f:
        f.write(w)
        print('\nWrote info file: {0}'.format(filepath))    
        os.chmod(filepath, 0o644)

